libokj committed on
Commit
c17cba8
·
1 Parent(s): 32e688d
app/db.py CHANGED
@@ -9,7 +9,7 @@ import pytz
9
  import requests
10
  from tinydb import TinyDB, where
11
 
12
- from app.utils import send_email
13
 
14
  SERVER_DATA_DIR = os.getenv('DATA', 'results')
15
  DB_EXPIRY = timedelta(hours=48).total_seconds()
 
9
  import requests
10
  from tinydb import TinyDB, where
11
 
12
+ from app.fn import send_email
13
 
14
  SERVER_DATA_DIR = os.getenv('DATA', 'results')
15
  DB_EXPIRY = timedelta(hours=48).total_seconds()
app/{utils.py → fn.py} RENAMED
@@ -644,8 +644,6 @@ def download_file(url):
644
  return None
645
 
646
 
647
-
648
-
649
  def uniprot_to_pdb(uniprot_id):
650
  """Queries the RCSB PDB API to find PDB entities associated with a UniProt ID."""
651
  base_url = "https://search.rcsb.org/rcsbsearch/v2/query"
 
644
  return None
645
 
646
 
 
 
647
  def uniprot_to_pdb(uniprot_id):
648
  """Queries the RCSB PDB API to find PDB entities associated with a UniProt ID."""
649
  base_url = "https://search.rcsb.org/rcsbsearch/v2/query"
app/main.py CHANGED
@@ -1,15 +1,10 @@
1
  import uuid
2
  import zipfile
3
  from datetime import datetime
4
- from email.mime.multipart import MIMEMultipart
5
- from email.mime.text import MIMEText
6
- from email.utils import formatdate, make_msgid
7
  from pathlib import Path
8
  from time import sleep, time
9
 
10
- import requests
11
  import torch
12
- import yaml
13
  from email_validator import validate_email, EmailNotValidError
14
  from Bio import SeqIO
15
  import gradio as gr
@@ -18,11 +13,10 @@ from omegaconf import OmegaConf
18
  import pandas as pd
19
  from rdkit import Chem
20
  from rdkit.Chem import PandasTools
21
- from tinydb import where
22
 
23
  from inference import (read_fragment_library, process_fragment_library, extract_pockets,
24
  dock_fragments, generate_linkers, select_fragment_pairs)
25
- from app import static, utils, db
26
 
27
 
28
  gr.set_static_paths(paths=["data/", "results/"])
@@ -65,7 +59,7 @@ def process_drug_library_upload(library_upload):
65
  )
66
  else:
67
  raise gr.Error('Current supported fragment library formats only include CSV and SDF files.')
68
- utils.validate_columns(df, ['X1'])
69
  return df
70
 
71
 
@@ -511,8 +505,8 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
511
  result_protein_file = gr.File(visible=False, interactive=False)
512
  with gr.Column(variant='panel'):
513
  with gr.Row():
514
- scores = gr.CheckboxGroup(list(utils.SCORE_MAP.keys()), label='Compound Scores')
515
- filters = gr.CheckboxGroup(list(utils.FILTER_MAP.keys()), label='Compound Filters')
516
  with gr.Row():
517
  prop_clr_btn = gr.ClearButton(value='Clear Properties', interactive=False)
518
  prop_calc_btn = gr.Button(value='Calculate Properties', interactive=False)
@@ -592,7 +586,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
592
  }
593
  elif filepath.suffix == '.fasta':
594
  seq = next(SeqIO.parse(file, 'fasta')).seq
595
- filepath = utils.pdb_query(seq, method='FASTA Sequence')
596
  return {
597
  input_prot_file: gr.File(str(filepath), visible=True),
598
  prot_query_input: seq,
@@ -606,7 +600,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
606
  # outputs=[prot_file],
607
  # )
608
  # prot_file.change(
609
- # fn=lambda file: gr.HTML(utils.create_complex_view_html(file), visible=True),
610
  # inputs=[prot_file],
611
  # outputs=[input_prot_view],
612
  # )
@@ -618,7 +612,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
618
  )
619
 
620
  prot_query_btn.click(
621
- fn=utils.pdb_query,
622
  inputs=[prot_query_input, prot_query_dropdown],
623
  outputs=[input_prot_file],
624
  )
@@ -640,7 +634,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
640
  # pocket_extract_btn.click(
641
  # fn=lambda: gr.Info('Extracting pocket...'),
642
  # ).then(
643
- # fn=utils.extract_pockets_and_update_view,
644
  # js=static.RETURN_LIGAND_SELECTION_JS,
645
  # inputs=[prot_file, selected_ligand],
646
  # outputs=[input_prot_view, pocket_path_dict, selected_ligand, selected_pocket],
@@ -759,11 +753,11 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
759
  try:
760
  for filter_name in filter_list:
761
  mod_df[filter_name] = mod_df['Compound'].parallel_apply(
762
- lambda x: utils.FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
763
 
764
  for score_name in score_list:
765
  mod_df[score_name] = mod_df['Compound'].parallel_apply(
766
- lambda x: utils.SCORE_MAP[score_name](x) if not pd.isna(x) else x)
767
 
768
  return {result_table_mod_df: mod_df}
769
 
@@ -784,7 +778,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
784
  outputs=[result_protein_file],
785
  )
786
  result_table_mod_df.change(
787
- fn=utils.create_result_table_html,
788
  inputs=[result_table_mod_df],
789
  outputs=[result_table_view]
790
  ).success(
@@ -828,4 +822,5 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
828
  demo.launch(
829
  server_name='0.0.0.0',
830
  max_file_size="5mb",
 
831
  )
 
1
  import uuid
2
  import zipfile
3
  from datetime import datetime
 
 
 
4
  from pathlib import Path
5
  from time import sleep, time
6
 
 
7
  import torch
 
8
  from email_validator import validate_email, EmailNotValidError
9
  from Bio import SeqIO
10
  import gradio as gr
 
13
  import pandas as pd
14
  from rdkit import Chem
15
  from rdkit.Chem import PandasTools
 
16
 
17
  from inference import (read_fragment_library, process_fragment_library, extract_pockets,
18
  dock_fragments, generate_linkers, select_fragment_pairs)
19
+ from app import static, fn, db
20
 
21
 
22
  gr.set_static_paths(paths=["data/", "results/"])
 
59
  )
60
  else:
61
  raise gr.Error('Current supported fragment library formats only include CSV and SDF files.')
62
+ fn.validate_columns(df, ['X1'])
63
  return df
64
 
65
 
 
505
  result_protein_file = gr.File(visible=False, interactive=False)
506
  with gr.Column(variant='panel'):
507
  with gr.Row():
508
+ scores = gr.CheckboxGroup(list(fn.SCORE_MAP.keys()), label='Compound Scores')
509
+ filters = gr.CheckboxGroup(list(fn.FILTER_MAP.keys()), label='Compound Filters')
510
  with gr.Row():
511
  prop_clr_btn = gr.ClearButton(value='Clear Properties', interactive=False)
512
  prop_calc_btn = gr.Button(value='Calculate Properties', interactive=False)
 
586
  }
587
  elif filepath.suffix == '.fasta':
588
  seq = next(SeqIO.parse(file, 'fasta')).seq
589
+ filepath = fn.pdb_query(seq, method='FASTA Sequence')
590
  return {
591
  input_prot_file: gr.File(str(filepath), visible=True),
592
  prot_query_input: seq,
 
600
  # outputs=[prot_file],
601
  # )
602
  # prot_file.change(
603
+ # fn=lambda file: gr.HTML(fn.create_complex_view_html(file), visible=True),
604
  # inputs=[prot_file],
605
  # outputs=[input_prot_view],
606
  # )
 
612
  )
613
 
614
  prot_query_btn.click(
615
+ fn=fn.pdb_query,
616
  inputs=[prot_query_input, prot_query_dropdown],
617
  outputs=[input_prot_file],
618
  )
 
634
  # pocket_extract_btn.click(
635
  # fn=lambda: gr.Info('Extracting pocket...'),
636
  # ).then(
637
+ # fn=fn.extract_pockets_and_update_view,
638
  # js=static.RETURN_LIGAND_SELECTION_JS,
639
  # inputs=[prot_file, selected_ligand],
640
  # outputs=[input_prot_view, pocket_path_dict, selected_ligand, selected_pocket],
 
753
  try:
754
  for filter_name in filter_list:
755
  mod_df[filter_name] = mod_df['Compound'].parallel_apply(
756
+ lambda x: fn.FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
757
 
758
  for score_name in score_list:
759
  mod_df[score_name] = mod_df['Compound'].parallel_apply(
760
+ lambda x: fn.SCORE_MAP[score_name](x) if not pd.isna(x) else x)
761
 
762
  return {result_table_mod_df: mod_df}
763
 
 
778
  outputs=[result_protein_file],
779
  )
780
  result_table_mod_df.change(
781
+ fn=fn.create_result_table_html,
782
  inputs=[result_table_mod_df],
783
  outputs=[result_table_view]
784
  ).success(
 
822
  demo.launch(
823
  server_name='0.0.0.0',
824
  max_file_size="5mb",
825
+ ssr_mode=False
826
  )
app/static.py CHANGED
@@ -788,6 +788,7 @@ SETUP_JS = """
788
  scripts.forEach((script) => {
789
  const scriptElement = document.createElement("script");
790
  scriptElement.src = script;
 
791
  document.head.appendChild(scriptElement);
792
  });
793
 
 
788
  scripts.forEach((script) => {
789
  const scriptElement = document.createElement("script");
790
  scriptElement.src = script;
791
+ scriptElement.async = true;
792
  document.head.appendChild(scriptElement);
793
  });
794
 
compute_metrics.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ import csv
4
+ import numpy as np
5
+ import pandas as pd
6
+ import sys
7
+
8
+ from networkx.algorithms import isomorphism
9
+ from rdkit import Chem
10
+ from rdkit.Chem import MolStandardize, QED, rdMolAlign, rdMolDescriptors
11
+ from src.delinker_utils import calc_SC_RDKit, frag_utils, sascorer
12
+ from src.utils import disable_rdkit_logging
13
+ from tqdm import tqdm
14
+
15
+ from pdb import set_trace
16
+
17
+
disable_rdkit_logging()

# Expected invocation (eight positional arguments after the script name):
#   DATA_SET GEN_SMI_FILE TRAIN_SET_PATH N_CORES VERBOSE RESTRICT PAINS_SMARTS METHOD
if len(sys.argv) != 9:
    print("Not provided all arguments")
    # Non-zero status so that calling shells/pipelines can detect the usage error.
    sys.exit(1)

data_set = sys.argv[1]  # Options handled further down: ZINC, CASF, GEOM, MOAD
gen_smi_file = sys.argv[2]  # Path to generated molecules
train_set_path = sys.argv[3]  # Path to training set
n_cores = int(sys.argv[4])  # Number of cores to use
# bool() of any non-empty string is True, so "False" or "0" used to switch
# verbose output ON. Only explicit negative spellings disable it now; every
# other value keeps its previous meaning (True).
verbose = sys.argv[5].strip().lower() not in ('', '0', 'false', 'no', 'off', 'none')
# The literal string "None" means "do not restrict the number of records".
restrict = None if sys.argv[6] == "None" else int(sys.argv[6])
pains_smarts_loc = sys.argv[7]  # Path to PAINS SMARTS
method = sys.argv[8]
# Explicit check instead of `assert`, which is stripped when running with -O.
if method not in ('diffusion', '3dlinker', 'delinker'):
    raise ValueError(f"Unknown method {method!r}; expected 'diffusion', '3dlinker' or 'delinker'")

if verbose:
    print("##### Start Settings #####")
    print("Data set:", data_set)
    print("Generated smiles file:", gen_smi_file)
    print("Training set:", train_set_path)
    print("Number of cores:", n_cores)
    print("Verbose:", verbose)
    print("Restrict data:", restrict)
    print("PAINS SMARTS location:", pains_smarts_loc)
    print("##### End Settings #####")
47
+
48
+
# Load molecules
# FORMAT: one space-separated record per line:
#   (Starting fragments (SMILES), Original molecule (SMILES), Generated molecule (SMILES), Generated linker)
# The fourth field (generated linker) is optional and defaults to ''.
data = []
with open(gen_smi_file, 'r') as f:
    for line in tqdm(f.readlines()):
        line = line.strip()
        if not line:
            # A blank line (typically a trailing newline at EOF) holds no record;
            # indexing its fields used to raise IndexError.
            continue
        parts = line.split(' ')
        data.append({
            'fragments': parts[0],
            'true_molecule': parts[1],
            'pred_molecule': parts[2],
            'pred_linker': parts[3] if len(parts) > 3 else '',
        })

# Optionally keep only the first `restrict` records.
if restrict is not None:
    data = data[:restrict]


# Aggregated metrics, filled in section by section below.
summary = {}
67
+
68
+ # -------------- Validity -------------- #
69
+
def is_valid(pred_mol_smiles, frag_smiles):
    """Decide whether a generated molecule counts as valid.

    A molecule is valid when both SMILES strings parse, the generated
    molecule passes RDKit's property sanitization, and the substructure
    match of the fragments inside it covers every fragment atom.

    :param pred_mol_smiles: SMILES of the generated molecule.
    :param frag_smiles: SMILES of the starting fragments.
    :return: True if all checks pass, False otherwise.
    """
    pred_mol = Chem.MolFromSmiles(pred_mol_smiles)
    frag = Chem.MolFromSmiles(frag_smiles)
    if frag is None or pred_mol is None:
        return False

    try:
        Chem.SanitizeMol(pred_mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
    except Exception:
        return False

    # The match tuple holds one atom index per matched fragment atom.
    matched_atoms = pred_mol.GetSubstructMatch(frag)
    return len(matched_atoms) == frag.GetNumAtoms()
84
+
85
+
# Flag every record as valid/invalid and report the valid share in percent.
valid_cnt = 0
total_cnt = 0
for obj in tqdm(data):
    valid = is_valid(obj['pred_molecule'], obj['fragments'])
    obj['valid'] = valid
    valid_cnt += valid  # bool counts as 0/1
    total_cnt += 1

# NaN (instead of ZeroDivisionError) when the input file held no records.
validity = valid_cnt / total_cnt * 100 if total_cnt else float('nan')
print(f'Validity: {validity:.3f}%')
summary['validity'] = validity
97
+
98
+ # ----------------- QED ------------------ #
99
+
# QED of every valid generated molecule; invalid records get None.
qed_values = []
for obj in tqdm(data):
    obj['qed'] = None
    if obj['valid']:
        obj['qed'] = QED.qed(Chem.MolFromSmiles(obj['pred_molecule']))
        qed_values.append(obj['qed'])

mean_qed = np.mean(qed_values)
print(f'Mean QED: {mean_qed:.3f}')
summary['qed'] = mean_qed
112
+
113
+ # ----------------- SA ------------------ #
114
+
# Synthetic-accessibility score (sascorer) of every valid generated molecule;
# invalid records get None.
sa_values = []
for obj in tqdm(data):
    obj['sa'] = None
    if obj['valid']:
        obj['sa'] = sascorer.calculateScore(Chem.MolFromSmiles(obj['pred_molecule']))
        sa_values.append(obj['sa'])

mean_sa = np.mean(sa_values)
print(f'Mean SA: {mean_sa:.3f}')
summary['sa'] = mean_sa
127
+
128
+ # ----------------- Number of Rings ------------------ #
129
+
# Number of rings in the generated linker of every valid record.
rings_n_values = []
for obj in tqdm(data):
    # Set the key up front so every record carries it, including the ones whose
    # linker cannot be processed (previously the key was simply missing there).
    obj['rings_n'] = None
    if not obj['valid']:
        continue

    try:
        rings_n = rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(obj['pred_linker']))
    except Exception:
        # Best effort: a linker that cannot be processed is left out of the mean.
        # `Exception` rather than a bare except, so Ctrl-C still interrupts the run.
        continue

    obj['rings_n'] = rings_n
    rings_n_values.append(rings_n)

print(f'Mean Number of Rings: {np.mean(rings_n_values):.3f}')
summary['rings_n'] = np.mean(rings_n_values)
146
+
147
+ # -------------- Uniqueness -------------- #
148
+
# Group the valid generated molecules by their input (true molecule + fragments).
true2samples = dict()
for obj in tqdm(data):
    if not obj['valid']:
        continue

    true_mol = obj['true_molecule']
    true_frags = obj['fragments']
    key = f'{true_mol}_{true_frags}'
    true2samples.setdefault(key, []).append(obj['pred_molecule'])

# Uniqueness = distinct generated SMILES per input, summed over all inputs,
# divided by the number of valid samples.
unique_cnt = 0
total_cnt = 0
for samples in tqdm(true2samples.values()):
    unique_cnt += len(set(samples))
    total_cnt += len(samples)

# NaN (instead of ZeroDivisionError) when no record was valid.
uniqueness = unique_cnt / total_cnt * 100 if total_cnt else float('nan')
print(f'Uniqueness: {uniqueness:.3f}%')
summary['uniqueness'] = uniqueness
168
+
169
+ # ----------------- Novelty ---------------- #
170
+
# Linkers seen during training, one per line of the training-set file.
with open(train_set_path, 'r') as f:
    linkers_train = {line.strip() for line in f}

# A valid record is "novel" when its (cleaned) linker is not a training linker.
novel_cnt = 0
total_cnt = 0
for obj in tqdm(data):
    if not obj['valid']:
        obj['pred_linker_clean'] = None
        obj['novel'] = False
        continue

    try:
        # NOTE(review): RemoveStereochemistry is handed the SMILES *string* here and
        # its return value is used, whereas the other calls in this script pass a
        # Mol and ignore the return value. This branch therefore looks like it
        # always raises, so the raw linker string below is what actually gets
        # compared. Left unchanged on purpose, because fixing it would change the
        # reported novelty - confirm against the format of the training linkers.
        linker = Chem.RemoveStereochemistry(obj['pred_linker'])
        linker = MolStandardize.canonicalize_tautomer_smiles(Chem.MolToSmiles(linker))
    except Exception:
        linker = obj['pred_linker']

    novel = linker not in linkers_train
    obj['pred_linker_clean'] = linker
    obj['novel'] = novel
    novel_cnt += novel  # bool counts as 0/1
    total_cnt += 1

# NaN (instead of ZeroDivisionError) when no record was valid.
novelty = novel_cnt / total_cnt * 100 if total_cnt else float('nan')
print(f'Novelty: {novelty:.3f}%')
summary['novelty'] = novelty
199
+
200
+ # ----------------- Recovery ---------------- #
201
+
# An input (true molecule + fragments) counts as recovered when at least one of
# its valid samples has the same stereo-free, H-free SMILES as the true molecule.
recovered_inputs = set()
all_inputs = set()
for obj in tqdm(data):
    if not obj['valid']:
        obj['recovered'] = False
        continue

    key = obj['true_molecule'] + '_' + obj['fragments']

    try:
        true_mol = Chem.MolFromSmiles(obj['true_molecule'])
        Chem.RemoveStereochemistry(true_mol)
        true_mol_smi = Chem.MolToSmiles(Chem.RemoveHs(true_mol))
    except Exception:
        # Fallback: retry without sanitization when the regular path fails.
        # `Exception` rather than a bare except, so Ctrl-C still interrupts the run.
        true_mol = Chem.MolFromSmiles(obj['true_molecule'], sanitize=False)
        Chem.RemoveStereochemistry(true_mol)
        true_mol_smi = Chem.MolToSmiles(Chem.RemoveHs(true_mol, sanitize=False))

    pred_mol = Chem.MolFromSmiles(obj['pred_molecule'])
    Chem.RemoveStereochemistry(pred_mol)
    pred_mol_smi = Chem.MolToSmiles(Chem.RemoveHs(pred_mol))

    recovered = true_mol_smi == pred_mol_smi
    obj['recovered'] = recovered
    if recovered:
        recovered_inputs.add(key)
    all_inputs.add(key)

# NaN (instead of ZeroDivisionError) when no record was valid.
recovery = len(recovered_inputs) / len(all_inputs) * 100 if all_inputs else float('nan')
print(f'Recovery: {recovery:.3f}%')
summary['recovery'] = recovery
233
+
234
+ # ----------------- PAINS Filter ---------------- #
235
+
236
+
def check_pains(mol, pains):
    """Return True when *mol* matches none of the given PAINS patterns.

    :param mol: molecule exposing ``HasSubstructMatch(pattern)``.
    :param pains: iterable of query patterns; ``None`` entries (e.g. a SMARTS
        string that could not be parsed) are skipped instead of being passed
        to the matcher.
    :return: False as soon as one pattern matches, True otherwise (also for an
        empty pattern collection).
    """
    return not any(mol.HasSubstructMatch(pain) for pain in pains if pain is not None)
242
+
243
+
# The first column of each CSV row is read as a PAINS SMARTS pattern.
with open(pains_smarts_loc, 'r') as f:
    # `if row` skips empty CSV rows, which would otherwise raise IndexError.
    pains_smarts = [Chem.MolFromSmarts(row[0], mergeHs=True) for row in csv.reader(f) if row]
    # Drop patterns that failed to parse (None) so they are never used as a query.
    pains_smarts = {pattern for pattern in pains_smarts if pattern is not None}


# Share of valid generated molecules that match none of the PAINS patterns.
passed_pains_cnt = 0
total_cnt = 0
for obj in tqdm(data):
    if not obj['valid']:
        obj['passed_pains'] = False
        continue

    pred_mol = Chem.MolFromSmiles(obj['pred_molecule'])
    passed_pains = check_pains(pred_mol, pains_smarts)
    obj['passed_pains'] = passed_pains
    passed_pains_cnt += passed_pains  # bool counts as 0/1
    total_cnt += 1

# NaN (instead of ZeroDivisionError) when no record was valid.
pains_score = passed_pains_cnt / total_cnt * 100 if total_cnt else float('nan')
print(f'Passed PAINS: {pains_score:.3f}%')
summary['pains'] = pains_score
265
+
266
+
267
+ # ----------------- RA Filter ---------------- #
268
+
def check_ring_filter(linker):
    """Return True unless a ring of *linker* contains a double bond.

    A ring fails the filter when one of its atoms has a bond of type DOUBLE
    whose begin and end atoms both belong to that same ring.

    :param linker: RDKit molecule of the linker.
    :return: False if any such bond exists, True otherwise (including when
        the molecule has no rings).
    """
    for ring in Chem.GetSymmSSSR(linker):
        for atom_idx in ring:
            for bond in linker.GetAtomWithIdx(atom_idx).GetBonds():
                if (bond.GetBondType() == Chem.BondType.DOUBLE
                        and bond.GetBeginAtomIdx() in ring
                        and bond.GetEndAtomIdx() in ring):
                    # One offending bond settles the answer; no need to keep scanning.
                    return False
    return True
278
+
279
+
# Share of valid records whose linker passes the ring filter. Records whose
# linker cannot be checked are marked as failed and left out of the percentage.
passed_ring_filter_cnt = 0
total_cnt = 0
for obj in tqdm(data):
    if not obj['valid']:
        obj['passed_ring_filter'] = False
        continue

    pred_linker = Chem.MolFromSmiles(obj['pred_linker'], sanitize=False)
    try:
        passed_ring_filter = check_ring_filter(pred_linker)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts the run.
        obj['passed_ring_filter'] = False
        continue
    obj['passed_ring_filter'] = passed_ring_filter
    passed_ring_filter_cnt += passed_ring_filter  # bool counts as 0/1
    total_cnt += 1

# NaN (instead of ZeroDivisionError) when no linker could be checked.
ra_score = passed_ring_filter_cnt / total_cnt * 100 if total_cnt else float('nan')
print(f'Passed Ring Filter: {ra_score:.3f}%')
summary['ra'] = ra_score
300
+
301
+ # ---------------------------- Saving -------------------------------- #
302
+
# Write the per-record table and the one-row summary next to the input file.
# Paths are derived by slicing off the last 3 / 4 characters of the input path.
table = pd.DataFrame(data)
out_path = f"{gen_smi_file[:-3]}csv"
table.to_csv(out_path, index=False)

summary_table = pd.DataFrame([summary])
summary_path = f"{gen_smi_file[:-4]}_summary.csv"
summary_table.to_csv(summary_path, index=False)
310
+
311
+ # ----------------------- RMSD --------------------- #
312
+
# 3D structures of the generated molecules, read from the SDF that sits next
# to the generated-SMILES file.
sdf_path = gen_smi_file[:-3] + 'sdf'
pred_mol_3d = Chem.SDMolSupplier(sdf_path)

# Reference test sets per data set: (SMILES file, SDF file with 3D molecules).
# Use SMILES of test set generated for molecules processed by OpenBabel
# (for consistency with other evaluation metrics)
# Because SMILES produced by our model are also based on OpenBabel
_TEST_SET_FILES = {
    'ZINC': ('datasets/zinc_final_test_smiles.smi', 'datasets/zinc_final_test_molecules.sdf'),
    'CASF': ('datasets/casf_final_test_smiles.smi', 'datasets/casf_final_test_molecules.sdf'),
    'GEOM': ('datasets/geom_multifrag_test_smiles.smi', 'datasets/geom_multifrag_test_molecules.sdf'),
    'MOAD': ('datasets/MOAD_test_smiles.smi', 'datasets/MOAD_test_molecules.sdf'),
}

# Only the 'diffusion' method with one of the data sets above is supported here.
if method != 'diffusion' or data_set not in _TEST_SET_FILES:
    raise NotImplementedError(
        f'3D metrics are not implemented for method={method!r}, data_set={data_set!r}'
    )

true_smi_path, true_mol_path = _TEST_SET_FILES[data_set]
true_smi = pd.read_csv(true_smi_path, sep=' ', names=['mol', 'frag']).mol.values
true_mol_3d = Chem.SDMolSupplier(true_mol_path)
# Map each reference SMILES to its 3D molecule; the two files are paired by position.
true_smi2mol3d = dict(zip(true_smi, true_mol_3d))
354
+
355
+
def find_exit(mol, num_frag):
    """Collect fragment-side atoms that are bonded to non-fragment atoms.

    Atoms with index below *num_frag* are treated as fragment atoms and all
    remaining atoms as the rest of the molecule (presumably the linker).

    :param mol: molecule exposing ``GetNumAtoms()`` and ``GetAtomWithIdx(i)``.
    :param num_frag: number of leading atoms that belong to the fragments.
    :return: list of fragment atom indices, one entry per bond that crosses
        the boundary (an index may appear more than once).
    """
    neighbors = []
    for atom_idx in range(num_frag, mol.GetNumAtoms()):
        # Direct index lookup instead of re-creating the atom sequence
        # (mol.GetAtoms()) on every iteration.
        for nbr in mol.GetAtomWithIdx(atom_idx).GetNeighbors():
            if nbr.GetIdx() < num_frag:
                neighbors.append(nbr.GetIdx())
    return neighbors
364
+
365
+
# RMSD between generated and reference 3D structures, for recovered molecules only.
rmsd_list = []
for obj, pred in tqdm(zip(data, pred_mol_3d), total=len(data)):
    obj['rmsd'] = None
    if not obj['recovered']:
        continue
    if pred is None:
        # The SDF entry could not be read; there is no 3D structure to align.
        continue

    true = true_smi2mol3d[obj['true_molecule']]

    Chem.RemoveStereochemistry(true)
    true = Chem.RemoveHs(true)

    Chem.RemoveStereochemistry(pred)
    pred = Chem.RemoveHs(pred)

    # Only align molecules whose topologies are isomorphic.
    G1 = frag_utils.topology_from_rdkit(pred)
    G2 = frag_utils.topology_from_rdkit(true)
    if not isomorphism.GraphMatcher(G1, G2).is_isomorphic():
        continue

    frag_size = Chem.MolFromSmiles(obj['fragments']).GetNumAtoms()
    num_atoms = pred.GetNumAtoms()
    num_linker = num_atoms - frag_size
    if num_linker <= 0:
        # No linker atoms: the rescaling below would divide by zero.
        continue

    error = rdMolAlign.GetBestRMS(pred, true)
    error *= np.sqrt(num_atoms / num_linker)  # only count rmsd on linker
    rmsd_list.append(error)
    obj['rmsd'] = error

rmsd_score = np.mean(rmsd_list)
print(f'Mean RMSD: {rmsd_score:.3f}')
summary['rmsd'] = rmsd_score
403
+
404
+ # ----------------------------- SC-RDKit -------------------------- #
405
+
406
+
def calc_sc_rdkit_full_mol(gen_mol, ref_mol):
    """Align *gen_mol* onto *ref_mol* and return their SC-RDKit score.

    :param gen_mol: generated 3D molecule; it is aligned with O3A before scoring.
    :param ref_mol: reference 3D molecule.
    :return: the SC-RDKit score, or the sentinel -0.5 when alignment or
        scoring fails for any reason.
    """
    try:
        rdMolAlign.GetO3A(gen_mol, ref_mol).Align()
        return calc_SC_RDKit.calc_SC_RDKit_score(gen_mol, ref_mol)
    except Exception:
        # Best effort by design: a failure is recorded as -0.5 rather than aborting
        # the run. `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
        return -0.5
414
+
415
+
# SC-RDKit score of every valid generated molecule against its reference structure.
sc_rdkit_list = []
for obj, pred in tqdm(zip(data, pred_mol_3d), total=len(data)):
    obj['sc_rdkit'] = None
    if not obj['valid']:
        continue

    true = true_smi2mol3d[obj['true_molecule']]
    score = calc_sc_rdkit_full_mol(pred, true)
    sc_rdkit_list.append(score)
    obj['sc_rdkit'] = score

# Share (in percent) of scores above each threshold, plus the mean score.
sc_rdkit_list = np.array(sc_rdkit_list)
sc_rdkit_7 = (sc_rdkit_list > 0.7).sum() / len(sc_rdkit_list) * 100
sc_rdkit_8 = (sc_rdkit_list > 0.8).sum() / len(sc_rdkit_list) * 100
sc_rdkit_9 = (sc_rdkit_list > 0.9).sum() / len(sc_rdkit_list) * 100
sc_rdkit_mean = np.mean(sc_rdkit_list)

# `.3f` (three decimals) like every other metric; the previous `3f` was a
# minimum-width spec and printed six decimals.
print(f'SC_RDKit > 0.7: {sc_rdkit_7:.3f}%')
print(f'SC_RDKit > 0.8: {sc_rdkit_8:.3f}%')
print(f'SC_RDKit > 0.9: {sc_rdkit_9:.3f}%')
print(f'Mean SC_RDKit: {sc_rdkit_mean}')

summary['sc_rdkit_7'] = sc_rdkit_7
summary['sc_rdkit_8'] = sc_rdkit_8
summary['sc_rdkit_9'] = sc_rdkit_9
summary['sc_rdkit_mean'] = sc_rdkit_mean
442
+
443
+ # ---------------------------- Saving -------------------------------- #
444
+
# Write the per-record table and the one-row summary again, to the same paths
# as before, now including the rmsd / sc_rdkit values added above.
table = pd.DataFrame(data)
out_path = f"{gen_smi_file[:-3]}csv"
table.to_csv(out_path, index=False)

summary_table = pd.DataFrame([summary])
summary_path = f"{gen_smi_file[:-4]}_summary.csv"
summary_table.to_csv(summary_path, index=False)
confidence/confidence_train.py DELETED
@@ -1,320 +0,0 @@
1
- import gc
2
- import math
3
- import os
4
-
5
- import shutil
6
-
7
- from argparse import Namespace, ArgumentParser, FileType
8
- import torch.nn.functional as F
9
-
10
- import wandb
11
- import torch
12
- from sklearn.metrics import roc_auc_score
13
- from torch_geometric.loader import DataListLoader, DataLoader
14
- from tqdm import tqdm
15
-
16
- from confidence.dataset import ConfidenceDataset
17
- from utils.training import AverageMeter
18
-
19
- torch.multiprocessing.set_sharing_strategy('file_system')
20
-
21
- import yaml
22
- from utils.utils import save_yaml_file, get_optimizer_and_scheduler, get_model
23
-
24
-
25
- parser = ArgumentParser()
26
- parser.add_argument('--config', type=FileType(mode='r'), default=None)
27
- parser.add_argument('--original_model_dir', type=str, default='workdir', help='Path to folder with trained model and hyperparameters')
28
- parser.add_argument('--restart_dir', type=str, default=None, help='')
29
- parser.add_argument('--use_original_model_cache', action='store_true', default=False, help='If this is true, the same dataset as in the original model will be used. Otherwise, the dataset parameters are used.')
30
- parser.add_argument('--data_dir', type=str, default='data/PDBBind_processed/', help='Folder containing original structures')
31
- parser.add_argument('--ckpt', type=str, default='best_model.pt', help='Checkpoint to use inside the folder')
32
- parser.add_argument('--model_save_frequency', type=int, default=0, help='Frequency with which to save the last model. If 0, then only the early stopping criterion best model is saved and overwritten.')
33
- parser.add_argument('--best_model_save_frequency', type=int, default=0, help='Frequency with which to save the best model. If 0, then only the early stopping criterion best model is saved and overwritten.')
34
- parser.add_argument('--run_name', type=str, default='test_confidence', help='')
35
- parser.add_argument('--project', type=str, default='diffdock_confidence', help='')
36
- parser.add_argument('--split_train', type=str, default='data/splits/timesplit_no_lig_overlap_train', help='Path of file defining the split')
37
- parser.add_argument('--split_val', type=str, default='data/splits/timesplit_no_lig_overlap_val', help='Path of file defining the split')
38
- parser.add_argument('--split_test', type=str, default='data/splits/timesplit_test', help='Path of file defining the split')
39
-
40
# Inference parameters for creating the positions and rmsds that the confidence predictor will be trained on.
parser.add_argument('--cache_path', type=str, default='data/cacheNew', help='Folder from where to load/restore cached dataset')
# Each id selects a ligand_positions_id<ID>.pkl file (plus its complex-name file) to be merged by ConfidenceDataset.
parser.add_argument('--cache_ids_to_combine', nargs='+', type=str, default=None, help='Ids of previously generated ligand position caches (ligand_positions_id<ID>.pkl) whose samples are combined into one confidence dataset.')
# ConfidenceDataset runs its preprocessing when the cache file for this id does not exist yet.
parser.add_argument('--cache_creation_id', type=int, default=None, help='Id of the ligand position cache (ligand_positions_id<ID>.pkl) to create if it does not exist yet. If unset, ligand_positions.pkl is used.')
parser.add_argument('--wandb', action='store_true', default=False, help='')
parser.add_argument('--inference_steps', type=int, default=2, help='Number of denoising steps')
parser.add_argument('--samples_per_complex', type=int, default=3, help='')
# With --balance the dataset draws the label (0/1) uniformly at random and then picks a matching sample.
parser.add_argument('--balance', action='store_true', default=False, help='If set, each training sample is drawn as a positive or a negative with equal probability. Cannot be combined with multiple --rmsd_classification_cutoff values.')
parser.add_argument('--rmsd_prediction', action='store_true', default=False, help='')
parser.add_argument('--rmsd_classification_cutoff', nargs='+', type=float, default=2, help='RMSD value below which a prediction is considered a positive. This can also be multiple cutoffs.')
50
-
51
- parser.add_argument('--log_dir', type=str, default='workdir', help='')
52
- parser.add_argument('--main_metric', type=str, default='accuracy', help='Metric to track for early stopping. Mostly [loss, accuracy, ROC AUC]')
53
- parser.add_argument('--main_metric_goal', type=str, default='max', help='Can be [min, max]')
54
- parser.add_argument('--transfer_weights', action='store_true', default=False, help='')
55
- parser.add_argument('--batch_size', type=int, default=5, help='')
56
- parser.add_argument('--lr', type=float, default=1e-3, help='')
57
- parser.add_argument('--w_decay', type=float, default=0.0, help='')
58
- parser.add_argument('--scheduler', type=str, default='plateau', help='')
59
- parser.add_argument('--scheduler_patience', type=int, default=20, help='')
60
- parser.add_argument('--n_epochs', type=int, default=5, help='')
61
-
62
- # Dataset
63
- parser.add_argument('--limit_complexes', type=int, default=0, help='')
64
- parser.add_argument('--all_atoms', action='store_true', default=True, help='')
65
- parser.add_argument('--multiplicity', type=int, default=1, help='')
66
- parser.add_argument('--chain_cutoff', type=float, default=10, help='')
67
- parser.add_argument('--receptor_radius', type=float, default=30, help='')
68
- parser.add_argument('--c_alpha_max_neighbors', type=int, default=10, help='')
69
- parser.add_argument('--atom_radius', type=float, default=5, help='')
70
- parser.add_argument('--atom_max_neighbors', type=int, default=8, help='')
71
- parser.add_argument('--matching_popsize', type=int, default=20, help='')
72
- parser.add_argument('--matching_maxiter', type=int, default=20, help='')
73
- parser.add_argument('--max_lig_size', type=int, default=None, help='Maximum number of heavy atoms')
74
- parser.add_argument('--remove_hs', action='store_true', default=False, help='remove Hs')
75
- parser.add_argument('--num_conformers', type=int, default=1, help='')
76
- parser.add_argument('--esm_embeddings_path', type=str, default=None,help='If this is set then the LM embeddings at that path will be used for the receptor features')
77
- parser.add_argument('--no_torsion', action='store_true', default=False, help='')
78
-
79
- # Model
80
- parser.add_argument('--num_conv_layers', type=int, default=2, help='Number of interaction layers')
81
- parser.add_argument('--max_radius', type=float, default=5.0, help='Radius cutoff for geometric graph')
82
- parser.add_argument('--scale_by_sigma', action='store_true', default=True, help='Whether to normalise the score')
83
- parser.add_argument('--ns', type=int, default=16, help='Number of hidden features per node of order 0')
84
- parser.add_argument('--nv', type=int, default=4, help='Number of hidden features per node of order >0')
85
- parser.add_argument('--distance_embed_dim', type=int, default=32, help='')
86
- parser.add_argument('--cross_distance_embed_dim', type=int, default=32, help='')
87
- parser.add_argument('--no_batch_norm', action='store_true', default=False, help='If set, it removes the batch norm')
88
- parser.add_argument('--use_second_order_repr', action='store_true', default=False, help='Whether to use only up to first order representations or also second')
89
- parser.add_argument('--cross_max_distance', type=float, default=80, help='')
90
- parser.add_argument('--dynamic_max_cross', action='store_true', default=False, help='')
91
- parser.add_argument('--dropout', type=float, default=0.0, help='MLP dropout')
92
- parser.add_argument('--embedding_type', type=str, default="sinusoidal", help='')
93
- parser.add_argument('--sigma_embed_dim', type=int, default=32, help='')
94
- parser.add_argument('--embedding_scale', type=int, default=10000, help='')
95
- parser.add_argument('--confidence_no_batchnorm', action='store_true', default=False, help='')
96
- parser.add_argument('--confidence_dropout', type=float, default=0.0, help='MLP dropout in confidence readout')
97
-
98
args = parser.parse_args()
if args.config:
    # Values from the YAML config override / extend the command-line values.
    # NOTE(review): FullLoader is kept as in the original; only use trusted config files.
    config_dict = yaml.load(args.config, Loader=yaml.FullLoader)
    arg_dict = args.__dict__
    for key, value in config_dict.items():
        current = arg_dict.get(key)
        if isinstance(value, list) and isinstance(current, list):
            # Existing list-valued argument: append the configured entries.
            current.extend(value)
        else:
            # Scalar value, or the argument currently holds None / a scalar
            # default (appending to those used to raise AttributeError).
            arg_dict[key] = value
    # Only the file name is needed from here on, so release the handle.
    config_name = args.config.name
    args.config.close()
    args.config = config_name
if args.main_metric_goal not in ('max', 'min'):
    # Explicit check instead of ``assert`` so it also runs under ``python -O``.
    raise ValueError(f"--main_metric_goal must be 'max' or 'min', got {args.main_metric_goal!r}")
110
-
111
def train_epoch(model, loader, optimizer, rmsd_prediction):
    """Run one training pass over ``loader`` and return ``meter.summary()``.

    The loss is MSE on ``rmsd`` when ``rmsd_prediction`` is set, cross entropy
    on ``y_binned`` when ``args.rmsd_classification_cutoff`` is a list, and
    binary cross entropy with logits on ``y`` otherwise. Batches that hit a
    CUDA out-of-memory ``RuntimeError`` are skipped.
    """
    model.train()
    meter = AverageMeter(['confidence_loss'])

    def _labels(batch, attr):
        # A list of graphs needs its labels concatenated and moved to the
        # device; a single batched object already exposes them as attribute.
        if isinstance(batch, list):
            return torch.cat([getattr(graph, attr) for graph in batch]).to(device)
        return getattr(batch, attr)

    for data in tqdm(loader, total=len(loader)):
        # NOTE(review): this only prints; the batch is still processed below.
        if (device.type == 'cuda' and len(data) % torch.cuda.device_count() == 1) or \
                (device.type == 'cpu' and data.num_graphs == 1):
            print("Skipping batch of size 1 since otherwise batchnorm would not work.")
        optimizer.zero_grad()
        try:
            pred = model(data)
            if rmsd_prediction:
                confidence_loss = F.mse_loss(pred, _labels(data, 'rmsd'))
            elif isinstance(args.rmsd_classification_cutoff, list):
                confidence_loss = F.cross_entropy(pred, _labels(data, 'y_binned'))
            else:
                confidence_loss = F.binary_cross_entropy_with_logits(pred, _labels(data, 'y'))
            confidence_loss.backward()
            optimizer.step()
            meter.add([confidence_loss.cpu().detach()])
        except RuntimeError as e:
            if 'out of memory' not in str(e):
                raise e
            print('| WARNING: ran out of memory, skipping batch')
            for param in model.parameters():
                if param.grad is not None:
                    del param.grad  # free some memory
            torch.cuda.empty_cache()
            gc.collect()
            continue

    return meter.summary()
147
-
148
def test_epoch(model, loader, rmsd_prediction):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        A ``(summary, baseline_metric)`` tuple. ``summary`` is
        ``meter.summary()``; the meter tracks ``'loss'`` in rmsd mode and
        ``'confidence_loss'``, ``'accuracy'`` and ``'ROC AUC'`` otherwise.
        ``baseline_metric`` is the mean absolute deviation of the labels from
        their mean in rmsd mode, else the fraction of positive labels.
    """
    model.eval()
    meter = AverageMeter(['loss'], unpooled_metrics=True) if rmsd_prediction else AverageMeter(['confidence_loss', 'accuracy', 'ROC AUC'], unpooled_metrics=True)
    all_labels = []
    for data in tqdm(loader, total=len(loader)):
        try:
            with torch.no_grad():
                pred = model(data)
            if rmsd_prediction:
                labels = torch.cat([graph.rmsd for graph in data]).to(device) if isinstance(data, list) else data.rmsd
                confidence_loss = F.mse_loss(pred, labels)
                meter.add([confidence_loss.cpu().detach()])
            else:
                if isinstance(args.rmsd_classification_cutoff, list):
                    labels = torch.cat([graph.y_binned for graph in data]).to(device) if isinstance(data, list) else data.y_binned
                    confidence_loss = F.cross_entropy(pred, labels)
                else:
                    labels = torch.cat([graph.y for graph in data]).to(device) if isinstance(data, list) else data.y
                    confidence_loss = F.binary_cross_entropy_with_logits(pred, labels)
                # A logit above zero counts as a positive prediction.
                accuracy = torch.mean((labels == (pred > 0).float()).float())
                try:
                    roc_auc = roc_auc_score(labels.detach().cpu().numpy(), pred.detach().cpu().numpy())
                except ValueError as e:
                    # A batch with a single class has no defined ROC AUC; score it as 0.
                    if 'Only one class present in y_true. ROC AUC score is not defined in that case.' in str(e):
                        roc_auc = 0
                    else:
                        raise
                meter.add([confidence_loss.cpu().detach(), accuracy.cpu().detach(), torch.tensor(roc_auc)])
            all_labels.append(labels)

        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
                for p in model.parameters():
                    if p.grad is not None:
                        del p.grad  # free some memory
                torch.cuda.empty_cache()
                continue
            else:
                raise

    all_labels = torch.cat(all_labels)

    if rmsd_prediction:
        baseline_metric = ((all_labels - all_labels.mean()).abs()).mean()
    else:
        baseline_metric = all_labels.sum() / len(all_labels)
    # NOTE(review): ``results`` is updated but a second ``meter.summary()`` is
    # what gets returned; kept as in the original because whether summary()
    # returns a fresh dict is not visible here.
    results = meter.summary()
    results.update({'baseline_metric': baseline_metric})
    return meter.summary(), baseline_metric
200
-
201
-
202
def train(args, model, optimizer, scheduler, train_loader, val_loader, run_dir):
    """Train for ``args.n_epochs`` epochs, validating and checkpointing each epoch.

    Per epoch: runs ``train_epoch`` and ``test_epoch``, optionally logs to
    wandb, steps the scheduler on ``args.main_metric``, saves
    ``best_model.pt`` whenever the main metric improves, optional periodic
    snapshots, and always ``last_model.pt`` (epoch, model and optimizer state).
    """
    # Start from the worst possible value so that the first epoch always
    # produces a best_model.pt, whatever the sign of the metric.
    best_val_metric = math.inf if args.main_metric_goal == 'min' else -math.inf
    best_epoch = 0
    # test_epoch registers its loss as 'loss' in rmsd mode and as
    # 'confidence_loss' in classification mode.
    val_loss_key = 'loss' if args.rmsd_prediction else 'confidence_loss'

    print("Starting training...")
    for epoch in range(args.n_epochs):
        logs = {}
        train_metrics = train_epoch(model, train_loader, optimizer, args.rmsd_prediction)
        print("Epoch {}: Training loss {:.4f}".format(epoch, train_metrics['confidence_loss']))

        val_metrics, baseline_metric = test_epoch(model, val_loader, args.rmsd_prediction)
        if args.rmsd_prediction:
            print("Epoch {}: Validation loss {:.4f}".format(epoch, val_metrics[val_loss_key]))
        else:
            print("Epoch {}: Validation loss {:.4f} accuracy {:.4f}".format(epoch, val_metrics[val_loss_key], val_metrics['accuracy']))

        if args.wandb:
            # The step belongs to wandb.log only; passing it to dict.update
            # would add a spurious 'step' entry to the logged metrics.
            logs.update({'valinf_' + k: v for k, v in val_metrics.items()})
            logs.update({'train_' + k: v for k, v in train_metrics.items()})
            logs.update({'mean_rmsd' if args.rmsd_prediction else 'fraction_positives': baseline_metric,
                         'current_lr': optimizer.param_groups[0]['lr']})
            wandb.log(logs, step=epoch + 1)

        if scheduler:
            scheduler.step(val_metrics[args.main_metric])

        # On CUDA the weights are read from ``model.module``.
        state_dict = model.module.state_dict() if device.type == 'cuda' else model.state_dict()

        if args.main_metric_goal == 'min' and val_metrics[args.main_metric] < best_val_metric or \
                args.main_metric_goal == 'max' and val_metrics[args.main_metric] > best_val_metric:
            best_val_metric = val_metrics[args.main_metric]
            best_epoch = epoch
            torch.save(state_dict, os.path.join(run_dir, 'best_model.pt'))
        if args.model_save_frequency > 0 and (epoch + 1) % args.model_save_frequency == 0:
            torch.save(state_dict, os.path.join(run_dir, f'model_epoch{epoch+1}.pt'))
        if args.best_model_save_frequency > 0 and (epoch + 1) % args.best_model_save_frequency == 0:
            shutil.copyfile(os.path.join(run_dir, 'best_model.pt'), os.path.join(run_dir, f'best_model_epoch{epoch+1}.pt'))

        torch.save({
            'epoch': epoch,
            'model': state_dict,
            'optimizer': optimizer.state_dict(),
        }, os.path.join(run_dir, 'last_model.pt'))

    print("Best Validation accuracy {} on Epoch {}".format(best_val_metric, best_epoch))
247
-
248
-
249
def construct_loader_confidence(args, device):
    """Build the train and validation loaders of the confidence dataset.

    If the train dataset fails because its generated ligand positions are
    missing, the validation dataset is still constructed before the failure
    is re-raised.

    Returns:
        ``(train_loader, val_loader)``.

    Raises:
        Exception: when the train dataset could not be loaded.
    """
    common_args = {'cache_path': args.cache_path, 'original_model_dir': args.original_model_dir, 'device': device,
                   'inference_steps': args.inference_steps, 'samples_per_complex': args.samples_per_complex,
                   'limit_complexes': args.limit_complexes, 'all_atoms': args.all_atoms, 'balance': args.balance,
                   'rmsd_classification_cutoff': args.rmsd_classification_cutoff, 'use_original_model_cache': args.use_original_model_cache,
                   'cache_creation_id': args.cache_creation_id, "cache_ids_to_combine": args.cache_ids_to_combine,
                   "model_ckpt": args.ckpt}
    loader_class = DataListLoader if torch.cuda.is_available() else DataLoader

    # Python unbinds the ``except ... as`` name when the clause ends, so the
    # error is stored explicitly for the deferred re-raise below.
    train_error = None
    try:
        train_dataset = ConfidenceDataset(split="train", args=args, **common_args)
        train_loader = loader_class(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
    except Exception as e:
        if 'The generated ligand positions with cache_id do not exist:' not in str(e):
            raise
        print("HAPPENING | Encountered the following exception when loading the confidence train dataset:")
        print(str(e))
        print("HAPPENING | We are still continuing because we want to try to generate the validation dataset if it has not been created yet:")
        train_error = e

    val_dataset = ConfidenceDataset(split="val", args=args, **common_args)
    val_loader = loader_class(dataset=val_dataset, batch_size=args.batch_size, shuffle=True)

    if train_error is not None:
        raise Exception('We encountered the exception during train dataset loading: ', train_error) from train_error
    return train_loader, val_loader
275
-
276
-
277
if __name__ == '__main__':
    # ``device`` is read as a module-level global by train_epoch, test_epoch and train.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    with open(f'{args.original_model_dir}/model_parameters.yml') as f:
        score_model_args = Namespace(**yaml.full_load(f))

    # construct loader
    train_loader, val_loader = construct_loader_confidence(args, device)
    model = get_model(score_model_args if args.transfer_weights else args, device, t_to_sigma=None, confidence_mode=True)
    optimizer, scheduler = get_optimizer_and_scheduler(args, model, scheduler_mode=args.main_metric_goal)

    if args.transfer_weights:
        print("HAPPENING | Transferring weights from original_model_dir to the new model after using original_model_dir's arguments to construct the new model.")
        checkpoint = torch.load(os.path.join(args.original_model_dir, args.ckpt), map_location=device)
        model_state_dict = model.state_dict()
        # Only parameters that also exist in the new model are transferred.
        transfer_weights_dict = {k: v for k, v in checkpoint.items() if k in model_state_dict}
        model_state_dict.update(transfer_weights_dict)  # update the layers with the pretrained weights
        model.load_state_dict(model_state_dict)

    elif args.restart_dir:
        # Named ``restart_state`` so the builtin ``dict`` is not shadowed module-wide.
        restart_state = torch.load(f'{args.restart_dir}/last_model.pt', map_location=torch.device('cpu'))
        model.module.load_state_dict(restart_state['model'], strict=True)
        optimizer.load_state_dict(restart_state['optimizer'])
        print("Restarting from epoch", restart_state['epoch'])

    numel = sum(p.numel() for p in model.parameters())
    print('Model with', numel, 'parameters')

    if args.wandb:
        wandb.init(
            entity='entity',
            settings=wandb.Settings(start_method="fork"),
            project=args.project,
            name=args.run_name,
            config=args
        )
        wandb.log({'numel': numel})

    # record parameters
    run_dir = os.path.join(args.log_dir, args.run_name)
    # Checkpoints and the parameter file are written here, so make sure it exists.
    os.makedirs(run_dir, exist_ok=True)
    yaml_file_name = os.path.join(run_dir, 'model_parameters.yml')
    save_yaml_file(yaml_file_name, args.__dict__)
    args.device = device

    train(args, model, optimizer, scheduler, train_loader, val_loader, run_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
confidence/dataset.py DELETED
@@ -1,276 +0,0 @@
1
- import itertools
2
- import math
3
- import os
4
- import pickle
5
- import random
6
- from argparse import Namespace
7
- from functools import partial
8
- import copy
9
-
10
- import numpy as np
11
- import pandas as pd
12
- import torch
13
- import yaml
14
- from torch_geometric.data import Dataset, Data
15
- from torch_geometric.loader import DataLoader
16
- from tqdm import tqdm
17
-
18
- from datasets.pdbbind import PDBBind
19
- from utils.diffusion_utils import get_t_schedule
20
- from utils.sampling import randomize_position, sampling
21
- from utils.utils import get_model
22
- from utils.diffusion_utils import t_to_sigma as t_to_sigma_compl
23
-
24
-
25
class ListDataset(Dataset):
    """Dataset wrapper around an in-memory sequence of data objects."""

    def __init__(self, list):
        super().__init__()
        # Kept as given; items are returned by position without copying.
        self.data_list = list

    def len(self) -> int:
        """Return the number of wrapped items."""
        items = self.data_list
        return len(items)

    def get(self, idx: int) -> Data:
        """Return the item stored at position ``idx``."""
        items = self.data_list
        return items[idx]
35
-
36
def get_cache_path(args, split):
    """Return the cache directory of the complex graphs for ``split``.

    The parent directory is ``args.cache_path`` with ``_torsion`` /
    ``_allatoms`` suffixes; the sub-directory name encodes the dataset
    parameters. Any ``split`` other than ``'train'`` selects ``args.split_val``.
    """
    base_dir = args.cache_path
    if not args.no_torsion:
        base_dir += '_torsion'
    if args.all_atoms:
        base_dir += '_allatoms'

    split_file = args.split_train if split == 'train' else args.split_val
    split_name = os.path.splitext(os.path.basename(split_file))[0]

    # Pieces of the sub-directory name, concatenated without separators.
    pieces = [
        f'limit{args.limit_complexes}',
        f'_INDEX{split_name}',
        f'_maxLigSize{args.max_lig_size}',
        f'_H{int(not args.remove_hs)}',
        f'_recRad{args.receptor_radius}',
        f'_recMax{args.c_alpha_max_neighbors}',
    ]
    if args.all_atoms:
        pieces.append(f'_atomRad{args.atom_radius}_atomMax{args.atom_max_neighbors}')
    if not (args.no_torsion or args.num_conformers == 1):
        pieces.append(f'_confs{args.num_conformers}')
    if args.esm_embeddings_path is not None:
        pieces.append('_esmEmbeddings')

    return os.path.join(base_dir, ''.join(pieces))
49
-
50
def get_args_and_cache_path(original_model_dir, split):
    """Load the original model's parameters and derive its graph cache path.

    Reads ``model_parameters.yml`` from ``original_model_dir`` and returns
    ``(model_args, cache_path)`` where ``cache_path`` comes from
    ``get_cache_path(model_args, split)``.
    """
    params_file = f'{original_model_dir}/model_parameters.yml'
    with open(params_file) as f:
        hyperparameters = yaml.full_load(f)
    model_args = Namespace(**hyperparameters)
    cache_dir = get_cache_path(model_args, split)
    return model_args, cache_dir
54
-
55
-
56
-
57
- class ConfidenceDataset(Dataset):
58
def __init__(self, cache_path, original_model_dir, split, device, limit_complexes,
             inference_steps, samples_per_complex, all_atoms,
             args, model_ckpt, balance=False, use_original_model_cache=True, rmsd_classification_cutoff=2,
             cache_ids_to_combine=None, cache_creation_id=None):
    """Load complex graphs together with sampled ligand positions and their RMSDs.

    Steps:
      1. Run ``self.preprocessing`` if the ligand position cache selected by
         ``cache_creation_id`` does not exist yet.
      2. Load the complex graphs (``heterographs.pkl``), creating them through
         ``PDBBind`` when the file is missing.
      3. Load positions/RMSDs either from ``ligand_positions.pkl`` or, when
         ``cache_ids_to_combine`` is given, from every
         ``ligand_positions_id<ID>.pkl`` and concatenate them per complex.
      4. Keep the complexes present in both sources, truncated to
         ``limit_complexes`` when it is positive.

    Raises:
        Exception: a cache id from ``cache_ids_to_combine`` has no position file.

    NOTE(review): all caches are read with ``pickle.load``; only point this at
    trusted cache directories.
    """
    super(ConfidenceDataset, self).__init__()

    self.device = device
    self.inference_steps = inference_steps
    self.limit_complexes = limit_complexes
    self.all_atoms = all_atoms
    self.original_model_dir = original_model_dir
    self.balance = balance
    self.use_original_model_cache = use_original_model_cache
    self.rmsd_classification_cutoff = rmsd_classification_cutoff
    self.cache_ids_to_combine = cache_ids_to_combine
    self.cache_creation_id = cache_creation_id
    self.samples_per_complex = samples_per_complex
    self.model_ckpt = model_ckpt

    self.original_model_args, original_model_cache = get_args_and_cache_path(original_model_dir, split)
    self.complex_graphs_cache = original_model_cache if self.use_original_model_cache else get_cache_path(args, split)

    # check if the docked positions have already been computed, if not run the preprocessing (docking every complex)
    self.full_cache_path = os.path.join(cache_path, f'model_{os.path.splitext(os.path.basename(original_model_dir))[0]}'
                                                    f'_split_{split}_limit_{limit_complexes}')

    if (not os.path.exists(os.path.join(self.full_cache_path, "ligand_positions.pkl")) and self.cache_creation_id is None) or \
            (not os.path.exists(os.path.join(self.full_cache_path, f"ligand_positions_id{self.cache_creation_id}.pkl")) and self.cache_creation_id is not None):
        os.makedirs(self.full_cache_path, exist_ok=True)
        self.preprocessing(original_model_cache)

    # load the graphs that the confidence model will use
    print('Using the cached complex graphs of the original model args' if self.use_original_model_cache else 'Not using the cached complex graphs of the original model args. Instead the complex graphs are used that are at the location given by the dataset parameters given to confidence_train.py')
    print(self.complex_graphs_cache)
    graphs_file = os.path.join(self.complex_graphs_cache, "heterographs.pkl")
    if not os.path.exists(graphs_file):
        print(f'HAPPENING | Complex graphs path does not exist yet: {graphs_file}. For that reason, we are now creating the dataset.')
        # Constructed only for its side effect of creating the graph cache.
        PDBBind(transform=None, root=args.data_dir, limit_complexes=args.limit_complexes,
                receptor_radius=args.receptor_radius,
                cache_path=args.cache_path, split_path=args.split_val if split == 'val' else args.split_train,
                remove_hs=args.remove_hs, max_lig_size=None,
                c_alpha_max_neighbors=args.c_alpha_max_neighbors,
                matching=not args.no_torsion, keep_original=True,
                popsize=args.matching_popsize,
                maxiter=args.matching_maxiter,
                all_atoms=args.all_atoms,
                atom_radius=args.atom_radius,
                atom_max_neighbors=args.atom_max_neighbors,
                esm_embeddings_path=args.esm_embeddings_path,
                require_ligand=True)

    print(f'HAPPENING | Loading complex graphs from: {graphs_file}')
    with open(graphs_file, 'rb') as f:
        complex_graphs = pickle.load(f)
    self.complex_graph_dict = {d.name: d for d in complex_graphs}

    if self.cache_ids_to_combine is None:
        positions_file = os.path.join(self.full_cache_path, "ligand_positions.pkl")
        names_file = os.path.join(self.full_cache_path, "complex_names_in_same_order.pkl")
        print(f'HAPPENING | Loading positions and rmsds from: {positions_file}')
        with open(positions_file, 'rb') as f:
            self.full_ligand_positions, self.rmsds = pickle.load(f)
        if os.path.exists(names_file):
            with open(names_file, 'rb') as f:
                generated_rmsd_complex_names = pickle.load(f)
        else:
            print('HAPPENING | The path, ', names_file,
                  ' does not exist. \n => We assume that means that we are using a ligand_positions.pkl where the '
                  'code was not saving the complex names for them yet. We now instead use the complex names of '
                  'the dataset that the original model used to create the ligand positions and RMSDs.')
            with open(os.path.join(original_model_cache, "heterographs.pkl"), 'rb') as f:
                original_model_complex_graphs = pickle.load(f)
            generated_rmsd_complex_names = [d.name for d in original_model_complex_graphs]
        assert (len(self.rmsds) == len(generated_rmsd_complex_names))
    else:
        per_cache_names, per_cache_rmsds, per_cache_positions = [], [], []
        for cache_id in self.cache_ids_to_combine:
            positions_file = os.path.join(self.full_cache_path, f"ligand_positions_id{cache_id}.pkl")
            # The message now shows the path that is actually opened below.
            print(f'HAPPENING | Loading positions and rmsds from cache_id from the path: {positions_file}')
            if not os.path.exists(positions_file):
                # be careful with changing this error message since it is sometimes caught in a try/except
                raise Exception(f'The generated ligand positions with cache_id do not exist: {cache_id}')
            with open(positions_file, 'rb') as f:
                full_ligand_positions, rmsds = pickle.load(f)
            with open(os.path.join(self.full_cache_path, f"complex_names_in_same_order_id{cache_id}.pkl"), 'rb') as f:
                names_unsorted = pickle.load(f)
            per_cache_names.append(names_unsorted)
            per_cache_rmsds.append(rmsds)
            per_cache_positions.append(full_ligand_positions)

        # Common ordering over all complex names seen in any cache.
        # NOTE(review): a name missing from one of the caches raises KeyError below.
        names_order = list(set(itertools.chain.from_iterable(per_cache_names)))
        all_rmsds, all_full_ligand_positions = [], []
        for cache_rmsds, cache_positions, cache_names in zip(per_cache_rmsds, per_cache_positions, per_cache_names):
            by_name = {name: (pos, rmsd) for name, pos, rmsd in zip(cache_names, cache_positions, cache_rmsds)}
            all_full_ligand_positions.append([by_name[name][0] for name in names_order])
            all_rmsds.append([by_name[name][1] for name in names_order])

        # Concatenate the samples of every cache, complex by complex.
        self.full_ligand_positions = [np.concatenate(per_complex, axis=0) for per_complex in zip(*all_full_ligand_positions)]
        self.rmsds = [np.concatenate(per_complex, axis=0) for per_complex in zip(*all_rmsds)]
        generated_rmsd_complex_names = names_order
    print('Number of complex graphs: ', len(self.complex_graph_dict))
    print('Number of RMSDs and positions for the complex graphs: ', len(self.full_ligand_positions))

    self.all_samples_per_complex = samples_per_complex * (1 if self.cache_ids_to_combine is None else len(self.cache_ids_to_combine))

    self.positions_rmsds_dict = {name: (pos, rmsd) for name, pos, rmsd in zip(generated_rmsd_complex_names, self.full_ligand_positions, self.rmsds)}
    # Only complexes that have both a graph and generated samples are usable.
    self.dataset_names = list(set(self.positions_rmsds_dict.keys()) & set(self.complex_graph_dict.keys()))
    if limit_complexes > 0:
        self.dataset_names = self.dataset_names[:limit_complexes]
165
-
166
def len(self):
    """Return the number of complexes in ``self.dataset_names``."""
    names = self.dataset_names
    return len(names)
168
-
169
def get(self, idx):
    """Return a copy of complex ``idx`` with one sampled ligand pose and its labels.

    With ``self.balance`` a label (0/1) is drawn uniformly and a matching
    negative / positive sample is picked. Without it a sample is drawn
    uniformly and ``y`` (plus ``y_binned`` for a list of cutoffs) and
    ``rmsd`` are set from that sample's RMSD. All diffusion times are zero.

    Raises:
        ValueError: ``self.balance`` is combined with a list of cutoffs.
    """
    name = self.dataset_names[idx]
    # Deep copy so the cached graph is never mutated.
    complex_graph = copy.deepcopy(self.complex_graph_dict[name])
    positions, rmsds = self.positions_rmsds_dict[name]

    if self.balance:
        if isinstance(self.rmsd_classification_cutoff, list): raise ValueError("a list for --rmsd_classification_cutoff can only be used without --balance")
        label = random.randint(0, 1)
        success = rmsds < self.rmsd_classification_cutoff
        n_success = np.count_nonzero(success)
        if label == 0 and n_success != self.all_samples_per_complex:
            # sample negative complex
            sample = random.randint(0, self.all_samples_per_complex - n_success - 1)
            lig_pos = positions[~success][sample]
            complex_graph['ligand'].pos = torch.from_numpy(lig_pos)
        else:
            # sample positive complex
            if n_success > 0:  # if there is no successful sample, the matched complex is returned
                sample = random.randint(0, n_success - 1)
                lig_pos = positions[success][sample]
                complex_graph['ligand'].pos = torch.from_numpy(lig_pos)
            # This branch always yields a positive pose, also when label 0 was
            # drawn but no negative sample exists; label it accordingly.
            label = 1
        complex_graph.y = torch.tensor(label).float()
    else:
        sample = random.randint(0, self.all_samples_per_complex - 1)
        sample_rmsd = rmsds[sample]
        cutoff = self.rmsd_classification_cutoff
        complex_graph['ligand'].pos = torch.from_numpy(positions[sample])
        if isinstance(cutoff, list):
            # One indicator per interval [0, c1), [c1, c2), ..., [cn, inf).
            complex_graph.y_binned = torch.tensor(np.logical_and(sample_rmsd < cutoff + [math.inf], sample_rmsd >= [0] + cutoff), dtype=torch.float).unsqueeze(0)
            complex_graph.y = torch.tensor(sample_rmsd < cutoff[0]).unsqueeze(0).float()
        else:
            complex_graph.y = torch.tensor(sample_rmsd < cutoff).float().unsqueeze(0)
        complex_graph.rmsd = torch.tensor(sample_rmsd).unsqueeze(0).float()

    # Every node type gets time 0 for the tr/rot/tor components.
    time_keys = ('tr', 'rot', 'tor')
    node_types = ['ligand', 'receptor'] + (['atom'] if self.all_atoms else [])
    for node_type in node_types:
        num_nodes = complex_graph[node_type].num_nodes
        complex_graph[node_type].node_t = {key: torch.zeros(num_nodes) for key in time_keys}
    complex_graph.complex_t = {key: torch.zeros(1) for key in time_keys}
    return complex_graph
211
-
212
def preprocessing(self, original_model_cache):
    """Sample poses with the original score model and cache them with their RMSDs.

    Loads the checkpoint ``self.model_ckpt`` from ``self.original_model_dir``,
    reads the pickled complex graphs from ``original_model_cache``, draws
    ``self.samples_per_complex`` poses per complex and writes two pickles into
    ``self.full_cache_path``: ``(full_ligand_positions, rmsds)`` and the complex
    names in the same order.

    Args:
        original_model_cache: Directory that contains ``heterographs.pkl``.
    """
    t_to_sigma = partial(t_to_sigma_compl, args=self.original_model_args)

    model = get_model(self.original_model_args, self.device, t_to_sigma=t_to_sigma, no_parallel=True)
    state_dict = torch.load(f'{self.original_model_dir}/{self.model_ckpt}', map_location=torch.device('cpu'))
    model.load_state_dict(state_dict, strict=True)
    model = model.to(self.device)
    model.eval()

    # The same schedule is used for the translation, rotation and torsion components.
    tr_schedule = get_t_schedule(inference_steps=self.inference_steps)
    rot_schedule = tr_schedule
    tor_schedule = tr_schedule
    print('common t schedule', tr_schedule)

    # Log the path that is actually opened (the message used to name a file under
    # self.complex_graphs_cache while the read below used original_model_cache).
    heterographs_path = os.path.join(original_model_cache, "heterographs.pkl")
    print('HAPPENING | loading cached complexes of the original model to create the confidence dataset RMSDs and predicted positions. Doing that from: ', heterographs_path)
    # NOTE(review): pickle.load can execute arbitrary code; only read caches produced by this project.
    with open(heterographs_path, 'rb') as f:
        complex_graphs = pickle.load(f)
    dataset = ListDataset(complex_graphs)
    loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)

    rmsds, full_ligand_positions, names = [], [], []
    for _idx, orig_complex_graph in tqdm(enumerate(loader)):
        data_list = [copy.deepcopy(orig_complex_graph) for _ in range(self.samples_per_complex)]
        randomize_position(data_list, self.original_model_args.no_torsion, False, self.original_model_args.tr_sigma_max)

        predictions_list = None
        failed_convergence_counter = 0
        while predictions_list is None:
            try:
                predictions_list, _ = sampling(data_list=data_list, model=model, inference_steps=self.inference_steps,
                                               tr_schedule=tr_schedule, rot_schedule=rot_schedule, tor_schedule=tor_schedule,
                                               device=self.device, t_to_sigma=t_to_sigma, model_args=self.original_model_args)
            except Exception as e:
                # Only convergence failures are retried; anything else propagates unchanged.
                if 'failed to converge' not in str(e):
                    raise
                failed_convergence_counter += 1
                if failed_convergence_counter > 5:
                    print('| WARNING: SVD failed to converge 5 times - skipping the complex')
                    break
                print('| WARNING: SVD failed to converge - trying again with a new sample')
        # After giving up, fall back to the randomized (unrefined) input poses.
        if failed_convergence_counter > 5:
            predictions_list = data_list
        if self.original_model_args.no_torsion:
            orig_complex_graph['ligand'].orig_pos = (orig_complex_graph['ligand'].pos.cpu().numpy() + orig_complex_graph.original_center.cpu().numpy())

        # Mask of ligand nodes whose first feature column is non-zero.
        # NOTE(review): the name suggests this drops hydrogens - confirm against the featurizer.
        filterHs = torch.not_equal(predictions_list[0]['ligand'].x[:, 0], 0).cpu().numpy()

        if isinstance(orig_complex_graph['ligand'].orig_pos, list):
            orig_complex_graph['ligand'].orig_pos = orig_complex_graph['ligand'].orig_pos[0]

        ligand_pos = np.asarray([complex_graph['ligand'].pos.cpu().numpy()[filterHs] for complex_graph in predictions_list])
        orig_ligand_pos = np.expand_dims(orig_complex_graph['ligand'].orig_pos[filterHs] - orig_complex_graph.original_center.cpu().numpy(), axis=0)
        # One RMSD per sampled pose.
        rmsd = np.sqrt(((ligand_pos - orig_ligand_pos) ** 2).sum(axis=2).mean(axis=1))

        rmsds.append(rmsd)
        full_ligand_positions.append(np.asarray([complex_graph['ligand'].pos.cpu().numpy() for complex_graph in predictions_list]))
        names.append(orig_complex_graph.name[0])
        # The line above assumes exactly one name per batch (batch_size=1); check it.
        assert(len(orig_complex_graph.name) == 1)

    cache_suffix = '' if self.cache_creation_id is None else '_id' + str(self.cache_creation_id)
    with open(os.path.join(self.full_cache_path, f"ligand_positions{cache_suffix}.pkl"), 'wb') as f:
        pickle.dump((full_ligand_positions, rmsds), f)
    with open(os.path.join(self.full_cache_path, f"complex_names_in_same_order{cache_suffix}.pkl"), 'wb') as f:
        pickle.dump((names), f)
274
-
275
-
276
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{confidence → datasets}/__init__.py RENAMED
File without changes
datasets/conformer_matching.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy, time
2
+ import numpy as np
3
+ from collections import defaultdict
4
+ from rdkit import Chem, RDLogger
5
+ from rdkit.Chem import AllChem, rdMolTransforms
6
+ from rdkit import Geometry
7
+ import networkx as nx
8
+ from scipy.optimize import differential_evolution
9
+
10
+ RDLogger.DisableLog('rdApp.*')
11
+
12
+ """
13
+ Conformer matching routines from Torsional Diffusion
14
+ """
15
+
16
def GetDihedral(conf, atom_idx):
    """Return the dihedral angle (radians) defined by four atoms of ``conf``.

    ``atom_idx`` must be indexable at positions 0-3; those four entries are
    passed to RDKit as the atom indices of the torsion.
    """
    first, second, third, fourth = (atom_idx[pos] for pos in range(4))
    return rdMolTransforms.GetDihedralRad(conf, first, second, third, fourth)
18
+
19
+
20
def SetDihedral(conf, atom_idx, new_vale):
    """Set the dihedral defined by four atoms of ``conf`` to ``new_vale`` radians.

    The conformer is modified in place; nothing is returned.
    """
    first, second, third, fourth = (atom_idx[pos] for pos in range(4))
    rdMolTransforms.SetDihedralRad(conf, first, second, third, fourth, new_vale)
22
+
23
+
24
def apply_changes(mol, values, rotatable_bonds, conf_id):
    """Return a ``copy.copy`` of ``mol`` with the given torsion values applied.

    ``values[k]`` is written to the dihedral described by ``rotatable_bonds[k]``
    on conformer ``conf_id`` of the copy.
    """
    opt_mol = copy.copy(mol)
    for bond_pos, torsion_atoms in enumerate(rotatable_bonds):
        SetDihedral(opt_mol.GetConformer(conf_id), torsion_atoms, values[bond_pos])
    return opt_mol
28
+
29
+
30
def optimize_rotatable_bonds(mol, true_mol, rotatable_bonds, probe_id=-1, ref_id=-1, seed=0, popsize=15, maxiter=500,
                             mutation=(0.5, 1), recombination=0.8):
    """Fit the torsion angles of ``mol`` to ``true_mol`` by differential evolution.

    Each rotatable bond gets a search interval of [-pi, pi]. The objective is
    ``OptimizeConformer.score_conformation`` and the best angles found are
    applied to conformer ``probe_id`` of a copy of the molecule, which is returned.
    """
    opt = OptimizeConformer(mol, true_mol, rotatable_bonds, seed=seed, probe_id=probe_id, ref_id=ref_id)
    # One (low, high) pair per torsion.
    bounds = [(-np.pi, np.pi) for _ in range(len(opt.rotatable_bonds))]

    result = differential_evolution(opt.score_conformation, bounds,
                                    maxiter=maxiter, popsize=popsize,
                                    mutation=mutation, recombination=recombination, disp=False, seed=seed)
    return apply_changes(opt.mol, result['x'], opt.rotatable_bonds, conf_id=probe_id)
45
+
46
+
47
class OptimizeConformer:
    """Objective wrapper used when fitting torsion angles of one molecule to another."""

    def __init__(self, mol, true_mol, rotatable_bonds, probe_id=-1, ref_id=-1, seed=None):
        super().__init__()
        # A falsy seed (None or 0) leaves NumPy's global random state untouched.
        if seed:
            np.random.seed(seed)
        self.mol = mol
        self.true_mol = true_mol
        self.rotatable_bonds = rotatable_bonds
        self.probe_id = probe_id
        self.ref_id = ref_id

    def score_conformation(self, values):
        """Write ``values`` into the probe conformer's torsions and return ``AllChem.AlignMol``'s result.

        Mutates ``self.mol`` in place: both the dihedral updates and the
        alignment act on the stored molecule.
        """
        for pos, torsion_atoms in enumerate(self.rotatable_bonds):
            SetDihedral(self.mol.GetConformer(self.probe_id), torsion_atoms, values[pos])
        return AllChem.AlignMol(self.mol, self.true_mol, self.probe_id, self.ref_id)
62
+
63
+
64
def get_torsion_angles(mol):
    """List atom-index quadruples describing the rotatable torsions of ``mol``.

    The bond graph is built from ``mol``. Each edge is then removed in turn
    from a copy of the graph; an edge is kept only when the copy is not
    connected afterwards and its smallest connected component has at least
    two atoms. For a kept edge ``(a, b)`` the tuple is
    ``(first remaining neighbour of a, a, b, first remaining neighbour of b)``.
    """
    graph = nx.Graph()
    for atom_pos, _atom in enumerate(mol.GetAtoms()):
        graph.add_node(atom_pos)
    for bond in mol.GetBonds():
        graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())

    torsions_list = []
    for edge in graph.edges():
        # deepcopy keeps the neighbour ordering of the original graph, which
        # decides which neighbour ends up in the tuple.
        cut_graph = copy.deepcopy(graph)
        cut_graph.remove_edge(*edge)
        if nx.is_connected(cut_graph):
            continue
        smallest_component = min(nx.connected_components(cut_graph), key=len)
        if len(smallest_component) < 2:
            continue
        begin_neighbours = list(cut_graph.neighbors(edge[0]))
        end_neighbours = list(cut_graph.neighbors(edge[1]))
        torsions_list.append((begin_neighbours[0], edge[0], edge[1], end_neighbours[0]))
    return torsions_list
85
+
datasets/constants.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Significant contribution from Ben Fry and Nick Polizzi
2
+
3
# Three-letter residue name -> one-letter code.
three_to_one = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
    'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
    'LEU': 'L', 'LYS': 'K', 'MET': 'M',
    # MSE is almost the same amino acid as MET: the sulfur is replaced by selenium.
    'MSE': 'M',
    'PHE': 'F', 'PRO': 'P', 'PYL': 'O', 'SER': 'S', 'SEC': 'U',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
    'ASX': 'B', 'GLX': 'Z', 'XAA': 'X', 'XLE': 'J',
}
30
+
31
+
32
# Three-letter residue name -> integer class index (0-19).
aa_name2aa_idx = {
    long_name: position
    for position, long_name in enumerate([
        'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE',
        'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL',
    ])
}
# MSE shares the index of MET.
aa_name2aa_idx['MSE'] = 12


# One-letter code -> three-letter residue name.
aa_short2long = {
    'C': 'CYS',
    'D': 'ASP',
    'S': 'SER',
    'Q': 'GLN',
    'K': 'LYS',
    'I': 'ILE',
    'P': 'PRO',
    'T': 'THR',
    'F': 'PHE',
    'N': 'ASN',
    'G': 'GLY',
    'H': 'HIS',
    'L': 'LEU',
    'R': 'ARG',
    'W': 'TRP',
    'A': 'ALA',
    'V': 'VAL',
    'E': 'GLU',
    'Y': 'TYR',
    'M': 'MET',
}


# Lookups derived from the two tables above.
aa_short2aa_idx = {short: aa_name2aa_idx[aa_short2long[short]] for short in aa_short2long}
aa_idx2aa_short = {aa_short2aa_idx[short]: short for short in aa_short2aa_idx}
aa_long2short = {aa_short2long[short]: short for short in aa_short2long}
aa_long2short['MSE'] = 'M'
47
+
48
# One-letter residue code -> {chi angle number: the four atom names defining it}.
# Residues without an entry (e.g. 'A', 'G') have no chi angles listed here.
chi = {
    'C': {1: ('N', 'CA', 'CB', 'SG')},
    'D': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'OD1')},
    'E': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD'),
          3: ('CB', 'CG', 'CD', 'OE1')},
    'F': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD1')},
    'H': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'ND1')},
    'I': {1: ('N', 'CA', 'CB', 'CG1'),
          2: ('CA', 'CB', 'CG1', 'CD1')},
    'K': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD'),
          3: ('CB', 'CG', 'CD', 'CE'),
          4: ('CG', 'CD', 'CE', 'NZ')},
    'L': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD1')},
    'M': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'SD'),
          3: ('CB', 'CG', 'SD', 'CE')},
    'N': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'OD1')},
    'P': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD')},
    'Q': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD'),
          3: ('CB', 'CG', 'CD', 'OE1')},
    'R': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD'),
          3: ('CB', 'CG', 'CD', 'NE'),
          4: ('CG', 'CD', 'NE', 'CZ')},
    'S': {1: ('N', 'CA', 'CB', 'OG')},
    'T': {1: ('N', 'CA', 'CB', 'OG1')},
    'V': {1: ('N', 'CA', 'CB', 'CG1')},
    'W': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD1')},
    'Y': {1: ('N', 'CA', 'CB', 'CG'),
          2: ('CA', 'CB', 'CG', 'CD1')},
}
106
+
107
+
108
# One-letter residue code -> ordered heavy-atom names: the four backbone atoms
# followed by the side-chain atoms. 'X' (unknown amino acid) has backbone only.
atom_order = {
    code: ['N', 'CA', 'C', 'O'] + side_chain
    for code, side_chain in {
        'G': [],
        'A': ['CB'],
        'S': ['CB', 'OG'],
        'C': ['CB', 'SG'],
        'T': ['CB', 'OG1', 'CG2'],
        'P': ['CB', 'CG', 'CD'],
        'V': ['CB', 'CG1', 'CG2'],
        'M': ['CB', 'CG', 'SD', 'CE'],
        'N': ['CB', 'CG', 'OD1', 'ND2'],
        'I': ['CB', 'CG1', 'CG2', 'CD1'],
        'L': ['CB', 'CG', 'CD1', 'CD2'],
        'D': ['CB', 'CG', 'OD1', 'OD2'],
        'E': ['CB', 'CG', 'CD', 'OE1', 'OE2'],
        'K': ['CB', 'CG', 'CD', 'CE', 'NZ'],
        'Q': ['CB', 'CG', 'CD', 'OE1', 'NE2'],
        'H': ['CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2'],
        'F': ['CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'],
        'R': ['CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2'],
        'Y': ['CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH'],
        'W': ['CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'NE1', 'CZ2', 'CZ3', 'CH2'],
        'X': [],
    }.items()
}
129
+
130
+
131
+
132
# Three-letter residue name -> SMILES string.
# NOTE(review): each string looks like the residue flanked by two glycine-like
# caps; confirm against the code that parses these.
amino_acid_smiles = dict(
    PHE='[NH3+]CC(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)O',
    MET='CSCC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    TYR='[NH3+]CC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)NCC(=O)O',
    ILE='CC[C@H](C)[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    TRP='[NH3+]CC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCC(=O)O',
    THR='C[C@@H](O)[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    CYS='[NH3+]CC(=O)N[C@@H](CS)C(=O)NCC(=O)O',
    ALA='C[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    LYS='[NH3+]CCCC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    PRO='[NH3+]CC(=O)N1CCC[C@H]1C(=O)NCC(=O)O',
    LEU='CC(C)C[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    GLY='[NH3+]CC(=O)NCC(=O)NCC(=O)O',
    ASP='[NH3+]CC(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)O',
    HIS='[NH3+]CC(=O)N[C@@H](Cc1c[nH]c[nH+]1)C(=O)NCC(=O)O',
    VAL='CC(C)[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    SER='[NH3+]CC(=O)N[C@@H](CO)C(=O)NCC(=O)O',
    ARG='NC(=[NH2+])NCCC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    GLU='[NH3+]CC(=O)N[C@@H](CCC(=O)O)C(=O)NCC(=O)O',
    GLN='NC(=O)CC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
    ASN='NC(=O)C[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
)
154
+
155
# Three-letter residue name -> {integer atom index: PDB atom name}.
# The insertion order of every inner dict is significant: aa_to_cg_indices
# walks ``.values()`` in that order.
# NOTE(review): the integer keys presumably index atoms of the molecule parsed
# from the matching amino_acid_smiles entry; confirm against the caller.
cg_rdkit_indices = dict(
    PHE={4: 'N', 5: 'CA', 13: 'C', 14: 'O', 6: 'CB', 7: 'CG', 8: 'CD1', 12: 'CD2', 9: 'CE1', 11: 'CE2', 10: 'CZ'},
    MET={5: 'N', 4: 'CA', 10: 'C', 11: 'O', 3: 'CB', 2: 'CG', 1: 'SD', 0: 'CE'},
    TYR={4: 'N', 5: 'CA', 14: 'C', 15: 'O', 6: 'CB', 7: 'CG', 8: 'CD1', 13: 'CD2', 9: 'CE1', 12: 'CE2', 10: 'CZ', 11: 'OH'},
    ILE={5: 'N', 4: 'CA', 10: 'C', 11: 'O', 2: 'CB', 1: 'CG1', 3: 'CG2', 0: 'CD1'},
    TRP={4: 'N', 5: 'CA', 16: 'C', 17: 'O', 6: 'CB', 7: 'CG', 8: 'CD1', 15: 'CD2', 9: 'NE1', 10: 'CE2', 14: 'CE3', 11: 'CZ2', 13: 'CZ3', 12: 'CH2'},
    THR={4: 'N', 3: 'CA', 9: 'C', 10: 'O', 1: 'CB', 2: 'OG1', 0: 'CG2'},
    CYS={4: 'N', 5: 'CA', 8: 'C', 9: 'O', 6: 'CB', 7: 'SG'},
    ALA={2: 'N', 1: 'CA', 7: 'C', 8: 'O', 0: 'CB'},
    LYS={6: 'N', 5: 'CA', 11: 'C', 12: 'O', 4: 'CB', 3: 'CG', 2: 'CD', 1: 'CE', 0: 'NZ'},
    PRO={4: 'N', 8: 'CA', 9: 'C', 10: 'O', 7: 'CB', 6: 'CG', 5: 'CD'},
    LEU={5: 'N', 4: 'CA', 10: 'C', 11: 'O', 3: 'CB', 1: 'CG', 0: 'CD1', 2: 'CD2'},
    GLY={4: 'N', 5: 'CA', 6: 'C', 7: 'O'},
    ASP={4: 'N', 5: 'CA', 10: 'C', 11: 'O', 6: 'CB', 7: 'CG', 8: 'OD1', 9: 'OD2'},
    HIS={4: 'N', 5: 'CA', 12: 'C', 13: 'O', 6: 'CB', 7: 'CG', 11: 'ND1', 8: 'CD2', 10: 'CE1', 9: 'NE2'},
    VAL={4: 'N', 3: 'CA', 9: 'C', 10: 'O', 1: 'CB', 0: 'CG1', 2: 'CG2'},
    SER={4: 'N', 5: 'CA', 8: 'C', 9: 'O', 6: 'CB', 7: 'OG'},
    ARG={8: 'N', 7: 'CA', 13: 'C', 14: 'O', 6: 'CB', 5: 'CG', 4: 'CD', 3: 'NE', 1: 'CZ', 0: 'NH1', 2: 'NH2'},
    GLU={4: 'N', 5: 'CA', 11: 'C', 12: 'O', 6: 'CB', 7: 'CG', 8: 'CD', 9: 'OE1', 10: 'OE2'},
    GLN={6: 'N', 5: 'CA', 11: 'C', 12: 'O', 4: 'CB', 3: 'CG', 1: 'CD', 2: 'OE1', 0: 'NE2'},
    ASN={5: 'N', 4: 'CA', 10: 'C', 11: 'O', 3: 'CB', 1: 'CG', 2: 'OD1', 0: 'ND2'},
)
177
+
178
# One-letter residue code -> positions within atom_order[code] of the atom
# names listed in cg_rdkit_indices, in the inner dict's order.
aa_to_cg_indices = {
    aa_long2short[resname]: [
        atom_order[aa_long2short[resname]].index(atom_name)
        for atom_name in index_to_name.values()
    ]
    for resname, index_to_name in cg_rdkit_indices.items()
}
179
+
datasets/dataloader.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Mapping, Sequence
2
+ from typing import List, Optional, Union
3
+
4
+ import torch.utils.data
5
+ from torch.utils.data.dataloader import default_collate
6
+
7
+ from torch_geometric.data import Batch, Dataset
8
+ from torch_geometric.data.data import BaseData
9
+
10
+
11
class Collater:
    """Collate function object that drops ``None`` samples before batching.

    Dispatches on the type of the first remaining sample; containers
    (mappings, named tuples, sequences) are collated recursively.
    """

    def __init__(self, follow_batch, exclude_keys):
        self.follow_batch = follow_batch
        self.exclude_keys = exclude_keys

    def __call__(self, batch):
        kept = [sample for sample in batch if sample is not None]
        # NOTE: a batch containing only ``None`` raises IndexError here.
        first = kept[0]

        if isinstance(first, BaseData):
            return Batch.from_data_list(kept, self.follow_batch, self.exclude_keys)
        if isinstance(first, torch.Tensor):
            return default_collate(kept)
        if isinstance(first, float):
            return torch.tensor(kept, dtype=torch.float)
        if isinstance(first, int):
            return torch.tensor(kept)
        if isinstance(first, str):
            return kept
        if isinstance(first, Mapping):
            return {key: self([sample[key] for sample in kept]) for key in first}
        if isinstance(first, tuple) and hasattr(first, '_fields'):
            # Named tuple: rebuild the same type from per-field collations.
            return type(first)(*(self(field) for field in zip(*kept)))
        if isinstance(first, Sequence) and not isinstance(first, str):
            return [self(column) for column in zip(*kept)]

        raise TypeError(f'DataLoader found invalid type: {type(first)}')

    def collate(self, batch):  # Deprecated...
        return self(batch)
41
+
42
+
43
class DataLoader(torch.utils.data.DataLoader):
    r"""Data loader that merges data objects into a mini-batch with :class:`Collater`.

    Args:
        dataset (Dataset): The dataset from which to load the data.
        batch_size (int, optional): How many samples per batch to load.
            (default: :obj:`1`)
        shuffle (bool, optional): If set to :obj:`True`, the data will be
            reshuffled at every epoch. (default: :obj:`False`)
        follow_batch (List[str], optional): Forwarded to :class:`Collater`.
            (default: :obj:`None`)
        exclude_keys (List[str], optional): Forwarded to :class:`Collater`.
            (default: :obj:`None`)
        **kwargs (optional): Additional arguments of
            :class:`torch.utils.data.DataLoader`. A ``collate_fn`` entry is
            discarded because this class always installs its own.
    """
    def __init__(
        self,
        dataset: Union[Dataset, List[BaseData]],
        batch_size: int = 1,
        shuffle: bool = False,
        follow_batch: Optional[List[str]] = None,
        exclude_keys: Optional[List[str]] = None,
        **kwargs,
    ):
        # A caller-supplied collate_fn is ignored.
        kwargs.pop('collate_fn', None)

        # Save for PyTorch Lightning:
        self.follow_batch = follow_batch
        self.exclude_keys = exclude_keys

        collater = Collater(follow_batch, exclude_keys)
        super().__init__(dataset, batch_size, shuffle, collate_fn=collater, **kwargs)
86
+
87
+
88
def collate_fn(data_list):
    """Return a new list holding the entries of ``data_list`` that are not ``None``."""
    kept = []
    for item in data_list:
        if item is None:
            continue
        kept.append(item)
    return kept
91
+
92
+
93
class DataListLoader(torch.utils.data.DataLoader):
    """Loader whose batches are plain Python lists of samples with ``None`` removed.

    Any ``collate_fn`` passed in ``kwargs`` is discarded in favour of the
    module-level ``collate_fn``.
    """

    def __init__(self, dataset: Union[Dataset, List[BaseData]],
                 batch_size: int = 1, shuffle: bool = False, **kwargs):
        kwargs.pop('collate_fn', None)
        loader_options = dict(batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
        super().__init__(dataset, **loader_options, **kwargs)
101
+
datasets/esm_embedding_preparation.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from argparse import ArgumentParser
4
+ from Bio.PDB import PDBParser
5
+ from Bio.Seq import Seq
6
+ from Bio.SeqRecord import SeqRecord
7
+ from tqdm import tqdm
8
+ from Bio import SeqIO
9
+
10
+ from datasets.constants import three_to_one
11
+
12
# Command-line options; parsed at import time because this file is run as a script.
parser = ArgumentParser()
parser.add_argument(
    '--out_file',
    type=str,
    default="data/prepared_for_esm.fasta",
)
parser.add_argument(
    '--dataset',
    type=str,
    default="pdbbind",
)
parser.add_argument(
    '--data_dir',
    type=str,
    default='../data/BindingMOAD_2020_ab_processed_biounit/pdb_protein/',
    help='',
)
args = parser.parse_args()

# A single parser instance is shared by every call to get_structure_from_file.
biopython_parser = PDBParser()
19
+
20
+
21
def get_structure_from_file(file_path):
    """Return one amino-acid sequence string per chain of a PDB file.

    Only ``structure[0]`` is read. Residues named ``HOH`` are skipped, and a
    residue contributes a letter only when it has atoms named ``CA``, ``N``
    and ``C``. Residue names missing from ``three_to_one`` are written as
    ``'-'`` and reported on stdout.

    Args:
        file_path: Path of the PDB file to parse.

    Returns:
        A list of strings in chain iteration order; a chain without any
        qualifying residue yields an empty string.
    """
    structure = biopython_parser.get_structure('random_id', file_path)
    model = structure[0]
    backbone_names = {'CA', 'N', 'C'}

    sequences = []
    for chain in model:
        letters = []
        for residue in chain:
            resname = residue.get_resname()
            if resname == 'HOH':
                continue
            # Only residues with a complete N/CA/C backbone count as amino acids.
            if not backbone_names <= {atom.name for atom in residue}:
                continue
            try:
                letters.append(three_to_one[resname])
            except KeyError:
                letters.append('-')
                print("encountered unknown AA: ", resname, ' in the complex ', file_path, '. Replacing it with a dash - .')
        sequences.append(''.join(letters))
    return sequences
47
+
48
data_dir = args.data_dir
names = os.listdir(data_dir)

if args.dataset == 'pdbbind':
    # One sub-directory per complex; write every chain as a FASTA record.
    sequences = []
    ids = []

    for name in tqdm(names):
        if name == '.DS_Store': continue
        # Prefer the processed receptor file when it exists.
        rec_path = os.path.join(data_dir, name, f'{name}_protein_processed.pdb')
        if not os.path.exists(rec_path):
            rec_path = os.path.join(data_dir, name, f'{name}_protein.pdb')
        for i, seq in enumerate(get_structure_from_file(rec_path)):
            sequences.append(seq)
            ids.append(f'{name}_chain_{i}')

    records = []
    for (index, seq) in zip(ids, sequences):
        record = SeqRecord(Seq(seq), str(index))
        record.description = ''
        records.append(record)
    SeqIO.write(records, args.out_file, "fasta")

elif args.dataset == 'moad':
    # Filter '.DS_Store' BEFORE truncating: after n[:6] it would read '.DS_St'
    # and the comparison could never match.
    names = [n[:6] for n in names if n != '.DS_Store']
    name_to_sequence = {}

    for name in tqdm(names):
        rec_path = os.path.join(data_dir, f'{name}_protein.pdb')
        if not os.path.exists(rec_path):
            print(f"We are skipping {name} because there was no {name}_protein.pdb")
            continue
        for i, seq in enumerate(get_structure_from_file(rec_path)):
            name_to_sequence[name + '_chain_' + str(i)] = seq

    # save to file (a pickled dict, whatever the extension of --out_file is)
    with open(args.out_file, 'wb') as f:
        pickle.dump(name_to_sequence, f)

else:
    # Previously an unknown value fell through and the script silently wrote nothing.
    raise ValueError(f"Unsupported --dataset value {args.dataset!r}; expected 'pdbbind' or 'moad'")
89
+
datasets/esm_embeddings_to_pt.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from argparse import ArgumentParser
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+
8
parser = ArgumentParser()
parser.add_argument('--esm_embeddings_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new', help='')
parser.add_argument('--output_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new.pt', help='')
args = parser.parse_args()

# Gather the ['representations'][33] entry of every file in the input directory
# into one mapping keyed by the file name up to its first '.'.
# (Named ``embeddings`` rather than ``dict`` so the builtin is not shadowed.)
embeddings = {}
for filename in tqdm(os.listdir(args.esm_embeddings_path)):
    embedding_file = os.path.join(args.esm_embeddings_path, filename)
    embeddings[filename.split('.')[0]] = torch.load(embedding_file)['representations'][33]
torch.save(embeddings, args.output_path)
datasets/loader.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch_geometric.data import Dataset
3
+
4
+ from datasets.dataloader import DataLoader, DataListLoader
5
+ from datasets.moad import MOAD
6
+ from datasets.pdb import PDBSidechain
7
+ from datasets.pdbbind import NoiseTransform, PDBBind
8
+ from utils.utils import read_strings_from_txt
9
+
10
+
11
class CombineDatasets(Dataset):
    """Concatenation of two datasets: indices of ``dataset1`` first, then ``dataset2``."""

    def __init__(self, dataset1, dataset2):
        super().__init__()
        self.dataset1 = dataset1
        self.dataset2 = dataset2

    def len(self):
        """Total number of items across both datasets."""
        return sum(len(part) for part in (self.dataset1, self.dataset2))

    def get(self, idx):
        """Return item ``idx``, taken from ``dataset2`` once ``idx`` passes the end of ``dataset1``."""
        first_size = len(self.dataset1)
        if idx >= first_size:
            return self.dataset2[idx - first_size]
        return self.dataset1[idx]

    def add_complexes(self, new_complex_list):
        """Forward the new complexes to ``dataset1`` only."""
        self.dataset1.add_complexes(new_complex_list)
28
+
29
+
30
def construct_loader(args, t_to_sigma, device):
    """Build the train and validation loaders selected by ``args.dataset``.

    Args:
        args: Namespace holding the dataset and loader options read below.
        t_to_sigma: Passed to ``NoiseTransform``.
        device: Not referenced in this function.

    Returns:
        ``(train_loader, val_loader, val_dataset2)``. ``val_dataset2`` is the
        PDBBind validation dataset when ``args.double_val`` is set, else ``None``.

    NOTE(review): nesting was reconstructed from a flattened listing - confirm
    the indentation against the repository before relying on it.
    NOTE(review): several names are assigned only on some branches. For example
    ``args.dataset == 'distillation'`` without ``combined_training`` /
    ``double_val`` never assigns ``train_dataset`` or ``val_dataset``, which
    would fail at the loader construction below.
    """
    val_dataset2 = None
    transform = NoiseTransform(t_to_sigma=t_to_sigma, no_torsion=args.no_torsion,
                               all_atom=args.all_atoms, alpha=args.sampling_alpha, beta=args.sampling_beta,
                               include_miscellaneous_atoms=False if not hasattr(args, 'include_miscellaneous_atoms') else args.include_miscellaneous_atoms,
                               crop_beyond_cutoff=args.crop_beyond)
    # Triple training is only meaningful on top of combined training.
    if args.triple_training: assert args.combined_training

    # Optional lookup from sequence string to its ESM embedding.
    sequences_to_embeddings = None
    if args.dataset == 'pdbsidechain' or args.triple_training:
        if args.pdbsidechain_esm_embeddings_path is not None:
            print('Loading ESM embeddings')
            id_to_embeddings = torch.load(args.pdbsidechain_esm_embeddings_path)
            sequences_list = read_strings_from_txt(args.pdbsidechain_esm_embeddings_sequences_path)
            sequences_to_embeddings = {}
            # Embeddings are keyed by the sequence's position in the text file, as a string.
            for i, seq in enumerate(sequences_list):
                if str(i) in id_to_embeddings:
                    sequences_to_embeddings[seq] = id_to_embeddings[str(i)]

    if args.dataset == 'pdbsidechain' or args.triple_training:

        common_args = {'root': args.pdbsidechain_dir, 'transform': transform, 'limit_complexes': args.limit_complexes,
                       'receptor_radius': args.receptor_radius,
                       'c_alpha_max_neighbors': args.c_alpha_max_neighbors,
                       'remove_hs': args.remove_hs, 'num_workers': args.num_workers, 'all_atoms': args.all_atoms,
                       'atom_radius': args.atom_radius, 'atom_max_neighbors': args.atom_max_neighbors,
                       'knn_only_graph': not args.not_knn_only_graph, 'sequences_to_embeddings': sequences_to_embeddings,
                       'vandermers_max_dist': args.vandermers_max_dist,
                       'vandermers_buffer_residue_num': args.vandermers_buffer_residue_num,
                       'vandermers_min_contacts': args.vandermers_min_contacts,
                       'remove_second_segment': args.remove_second_segment,
                       'merge_clusters': args.merge_clusters}
        # Kept under its own name so it can be appended to the combined dataset below.
        train_dataset3 = PDBSidechain(cache_path=args.cache_path, split='train', multiplicity=args.train_multiplicity, **common_args)

        if args.dataset == 'pdbsidechain':
            train_dataset = train_dataset3
            val_dataset = PDBSidechain(cache_path=args.cache_path, split='val', multiplicity=args.val_multiplicity, **common_args)
            loader_class = DataListLoader if torch.cuda.is_available() else DataLoader

    if args.dataset in ['pdbbind', 'moad', 'generalisation', 'distillation']:
        # Rebinds common_args: from here on it holds the PDBBind/MOAD options.
        common_args = {'transform': transform, 'limit_complexes': args.limit_complexes,
                       'chain_cutoff': args.chain_cutoff, 'receptor_radius': args.receptor_radius,
                       'c_alpha_max_neighbors': args.c_alpha_max_neighbors,
                       'remove_hs': args.remove_hs, 'max_lig_size': args.max_lig_size,
                       'matching': not args.no_torsion, 'popsize': args.matching_popsize, 'maxiter': args.matching_maxiter,
                       'num_workers': args.num_workers, 'all_atoms': args.all_atoms,
                       'atom_radius': args.atom_radius, 'atom_max_neighbors': args.atom_max_neighbors,
                       'knn_only_graph': False if not hasattr(args, 'not_knn_only_graph') else not args.not_knn_only_graph,
                       'include_miscellaneous_atoms': False if not hasattr(args, 'include_miscellaneous_atoms') else args.include_miscellaneous_atoms,
                       'matching_tries': args.matching_tries}

        if args.dataset == 'pdbbind' or args.dataset == 'generalisation' or args.combined_training:
            train_dataset = PDBBind(cache_path=args.cache_path, split_path=args.split_train, keep_original=True,
                                    num_conformers=args.num_conformers, root=args.pdbbind_dir,
                                    esm_embeddings_path=args.pdbbind_esm_embeddings_path,
                                    protein_file=args.protein_file, **common_args)

        if args.dataset == 'moad' or args.combined_training:
            train_dataset2 = MOAD(cache_path=args.cache_path, split='train', keep_original=True,
                                  num_conformers=args.num_conformers, max_receptor_size=args.max_receptor_size,
                                  remove_promiscuous_targets=args.remove_promiscuous_targets, min_ligand_size=args.min_ligand_size,
                                  multiplicity= args.train_multiplicity, unroll_clusters=args.unroll_clusters,
                                  esm_embeddings_sequences_path=args.moad_esm_embeddings_sequences_path,
                                  root=args.moad_dir, esm_embeddings_path=args.moad_esm_embeddings_path,
                                  enforce_timesplit=args.enforce_timesplit, **common_args)

            if args.combined_training:
                # MOAD items come first, then PDBBind, then (optionally) PDBSidechain.
                train_dataset = CombineDatasets(train_dataset2, train_dataset)
                if args.triple_training:
                    train_dataset = CombineDatasets(train_dataset, train_dataset3)
            else:
                train_dataset = train_dataset2

        if args.dataset == 'pdbbind' or args.double_val:
            val_dataset = PDBBind(cache_path=args.cache_path, split_path=args.split_val, keep_original=True,
                                  esm_embeddings_path=args.pdbbind_esm_embeddings_path, root=args.pdbbind_dir,
                                  protein_file=args.protein_file, require_ligand=True, **common_args)
            if args.double_val:
                # Returned separately; val_dataset may be replaced by MOAD just below.
                val_dataset2 = val_dataset

        if args.dataset == 'moad' or args.dataset == 'generalisation':
            val_dataset = MOAD(cache_path=args.cache_path, split='val', keep_original=True,
                               multiplicity= args.val_multiplicity, max_receptor_size=args.max_receptor_size,
                               remove_promiscuous_targets=args.remove_promiscuous_targets, min_ligand_size=args.min_ligand_size,
                               esm_embeddings_sequences_path=args.moad_esm_embeddings_sequences_path,
                               unroll_clusters=args.unroll_clusters, root=args.moad_dir,
                               esm_embeddings_path=args.moad_esm_embeddings_path, require_ligand=True, **common_args)

        # List-returning loader when CUDA is available, batching loader otherwise.
        loader_class = DataListLoader if torch.cuda.is_available() else DataLoader

    train_loader = loader_class(dataset=train_dataset, batch_size=args.batch_size, num_workers=args.num_dataloader_workers, shuffle=True, pin_memory=args.pin_memory, drop_last=args.dataloader_drop_last)
    val_loader = loader_class(dataset=val_dataset, batch_size=args.batch_size, num_workers=args.num_dataloader_workers, shuffle=False, pin_memory=args.pin_memory, drop_last=args.dataloader_drop_last)
    return train_loader, val_loader, val_dataset2
123
+
datasets/moad.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from multiprocessing import Pool
4
+ import random
5
+ import copy
6
+ from torch_geometric.data import Batch
7
+
8
+ import numpy as np
9
+ import torch
10
+ from prody import confProDy
11
+ from rdkit import Chem
12
+ from rdkit.Chem import RemoveHs
13
+ from torch_geometric.data import Dataset, HeteroData
14
+ from torch_geometric.utils import subgraph
15
+ from tqdm import tqdm
16
+ confProDy(verbosity='none')
17
+ from datasets.process_mols import get_lig_graph_with_matching, moad_extract_receptor_structure
18
+ from utils.utils import read_strings_from_txt
19
+
20
+ class MOAD(Dataset):
21
    def __init__(self, root, transform=None, cache_path='data/cache', split='train', limit_complexes=0, chain_cutoff=None,
                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, popsize=15, maxiter=15,
                 matching=True, keep_original=False, max_lig_size=None, remove_hs=False, num_conformers=1, all_atoms=False,
                 atom_radius=5, atom_max_neighbors=None, esm_embeddings_path=None, esm_embeddings_sequences_path=None, require_ligand=False,
                 include_miscellaneous_atoms=False, keep_local_structures=False,
                 min_ligand_size=0, knn_only_graph=False, matching_tries=1, multiplicity=1,
                 max_receptor_size=None, remove_promiscuous_targets=None, unroll_clusters=False, remove_pdbbind=False,
                 enforce_timesplit=False, no_randomness=False, single_cluster_name=None, total_dataset_size=None, skip_matching=False):
        """Build the dataset: load (or create) the receptor/ligand caches and filter them.

        The constructor stores the configuration, derives the receptor and ligand
        cache directories from it, runs the preprocessing steps when the cached
        pickles are missing, loads the ligands and receptors, and then applies the
        optional filters (single cluster, minimum ligand size, missing receptor,
        ``remove_pdbbind``, ``enforce_timesplit``) before fixing the final
        ``split_clusters`` / ``cluster_to_ligands`` mapping.

        Side effects: reads several pickle/text files under ``./data`` and ``root``,
        may write cache pickles under ``cache_path``, prints progress, calls
        ``print_statistics(self)`` (which iterates over the dataset) and writes the
        retained ligand names to ``moad_{split}_names.txt`` in the receptor cache.
        """

        super(MOAD, self).__init__(root, transform)
        self.moad_dir = root
        self.include_miscellaneous_atoms = include_miscellaneous_atoms
        self.max_lig_size = max_lig_size
        self.split = split
        self.limit_complexes = limit_complexes
        self.receptor_radius = receptor_radius
        self.num_workers = num_workers
        self.c_alpha_max_neighbors = c_alpha_max_neighbors
        self.remove_hs = remove_hs
        self.require_ligand = require_ligand
        self.esm_embeddings_path = esm_embeddings_path
        self.esm_embeddings_sequences_path = esm_embeddings_sequences_path
        self.keep_local_structures = keep_local_structures
        self.knn_only_graph = knn_only_graph
        self.matching_tries = matching_tries
        self.all_atoms = all_atoms
        self.multiplicity = multiplicity
        self.chain_cutoff = chain_cutoff
        self.no_randomness = no_randomness
        self.total_dataset_size = total_dataset_size
        self.skip_matching = skip_matching

        # The cache directory names encode every option that changes the cached
        # receptor graphs, so different configurations never share a cache.
        self.prot_cache_path = os.path.join(cache_path, f'MOAD12_limit{self.limit_complexes}_INDEX{self.split}'
                                            f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
                                            + (''if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
                                            + ('' if self.esm_embeddings_path is None else f'_esmEmbeddings')
                                            + ('' if not self.include_miscellaneous_atoms else '_miscAtoms')
                                            + ('' if not self.knn_only_graph else '_knnOnly'))

        # Same idea for the ligand cache (ligand size limit, hydrogens, matching options).
        self.lig_cache_path = os.path.join(cache_path, f'MOAD12_limit{self.limit_complexes}_INDEX{self.split}'
                                           f'_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}'
                                           + ('' if not matching else f'_matching')
                                           + ('' if not skip_matching else f'skip')
                                           + (''if not matching or num_conformers == 1 else f'_confs{num_conformers}')
                                           + ('' if not keep_local_structures else f'_keptLocalStruct')
                                           + ('' if self.matching_tries == 1 else f'_tries{matching_tries}'))

        self.popsize, self.maxiter = popsize, maxiter
        self.matching, self.keep_original = matching, keep_original
        self.num_conformers = num_conformers
        self.single_cluster_name = single_cluster_name
        # Only the local lookup key is remapped; self.split keeps the caller's value.
        if split == 'train':
            split = 'PDBBind'

        # Cluster names belonging to the requested split.
        with open("./data/splits/MOAD_generalisation_splits.pkl", "rb") as f:
            self.split_clusters = pickle.load(f)[split]

        # Mapping cluster name -> list of ligand (complex) names.
        clustes_path = os.path.join(self.moad_dir, "new_cluster_to_ligands.pkl")
        with open(clustes_path, "rb") as f:
            self.cluster_to_ligands = pickle.load(f)
            #self.cluster_to_ligands = {k: [s.split('.')[0] for s in v] for k, v in self.cluster_to_ligands.items()}

        # Build the chunked receptor cache if any chunk file is missing.
        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
        if not self.check_all_receptors():
            os.makedirs(self.prot_cache_path, exist_ok=True)
            self.preprocessing_receptors()

        # Build the ligand cache if the merged ligands.pkl is missing.
        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
        if not os.path.exists(os.path.join(self.lig_cache_path, "ligands.pkl")):
            os.makedirs(self.lig_cache_path, exist_ok=True)
            self.preprocessing_ligands()

        print('loading ligands from memory: ', os.path.join(self.lig_cache_path, "ligands.pkl"))
        with open(os.path.join(self.lig_cache_path, "ligands.pkl"), 'rb') as f:
            self.ligands = pickle.load(f)

        if require_ligand:
            with open(os.path.join(self.lig_cache_path, "rdkit_ligands.pkl"), 'rb') as f:
                self.rdkit_ligands = pickle.load(f)
                # Pair each RDKit molecule with its ligand graph by position and key it by ligand name.
                self.rdkit_ligands = {lig.name:mol for mol, lig in zip(self.rdkit_ligands, self.ligands)}

        # Optional restriction to the ligands of one named cluster.
        len_before = len(self.ligands)
        if not self.single_cluster_name is None:
            self.ligands = [lig for lig in self.ligands if lig.name in self.cluster_to_ligands[self.single_cluster_name]]
            print('Kept', len(self.ligands), f'ligands in {self.single_cluster_name} out of', len_before)

        # From here on self.ligands is a dict name -> ligand graph; small ligands are dropped.
        len_before = len(self.ligands)
        self.ligands = {lig.name: lig for lig in self.ligands if min_ligand_size == 0 or lig['ligand'].x.shape[0] >= min_ligand_size}
        print('removed', len_before - len(self.ligands), 'ligands below minimum size out of', len_before)

        # The first six characters of a ligand name are used as the receptor key.
        receptors_names = set([lig.name[:6] for lig in self.ligands.values()])
        self.collect_receptors(receptors_names, max_receptor_size, remove_promiscuous_targets)

        # filter ligands for which the receptor failed
        tot_before = len(self.ligands)
        self.ligands = {k:v for k, v in self.ligands.items() if k[:6] in self.receptors}
        print('removed', tot_before - len(self.ligands), 'ligands with no receptor out of', tot_before)

        if remove_pdbbind:
            # Drop every cluster that contains a complex listed in the two timesplit files.
            complexes_pdbbind = read_strings_from_txt('data/splits/timesplit_no_lig_overlap_train') + read_strings_from_txt('data/splits/timesplit_no_lig_overlap_val')
            with open('data/BindingMOAD_2020_ab_processed_biounit/ecod_t_group_binding_site_assignment_dict_major_domain.pkl', 'rb') as f:
                pdbbind_to_cluster = pickle.load(f)
            clusters_pdbbind = set([pdbbind_to_cluster[c] for c in complexes_pdbbind])
            self.split_clusters = [c for c in self.split_clusters if c not in clusters_pdbbind]
            self.cluster_to_ligands = {k: v for k, v in self.cluster_to_ligands.items() if k not in clusters_pdbbind}
            ligand_accepted = []
            for c, ligands in self.cluster_to_ligands.items():
                ligand_accepted += ligands
            ligand_accepted = set(ligand_accepted)
            tot_before = len(self.ligands)
            self.ligands = {k: v for k, v in self.ligands.items() if k in ligand_accepted}
            print('removed', tot_before - len(self.ligands), 'ligands in overlap with PDBBind out of', tot_before)

        if enforce_timesplit:
            with open("data/splits/pdbids_2019", "r") as f:
                lines = f.readlines()
            pdbids_from2019 = []
            # Fixed-layout file: every 4th line starting at line index 6 holds an id
            # in columns 18-21 (layout not visible here -- NOTE(review): confirm).
            for i in range(6, len(lines), 4):
                pdbids_from2019.append(lines[i][18:22])

            pdbids_from2019 = set(pdbids_from2019)
            len_before = len(self.ligands)
            self.ligands = {k: v for k, v in self.ligands.items() if k[:4].upper() not in pdbids_from2019}
            print('removed', len_before - len(self.ligands), 'ligands from 2019 out of', len_before)

        if unroll_clusters:
            # Treat every receptor as its own cluster.
            rec_keys = set([k[:6] for k in self.ligands.keys()])
            self.cluster_to_ligands = {k:[k2 for k2 in self.ligands.keys() if k2[:6] == k] for k in rec_keys}
            self.split_clusters = list(rec_keys)
        else:
            # Keep only surviving ligands per cluster and drop clusters left empty.
            for c in self.cluster_to_ligands.keys():
                self.cluster_to_ligands[c] = [v for v in self.cluster_to_ligands[c] if v in self.ligands]
            self.split_clusters = [c for c in self.split_clusters if len(self.cluster_to_ligands[c])>0]

        print_statistics(self)
        # Record which ligand names ended up in the dataset.
        list_names = [name for cluster in self.split_clusters for name in self.cluster_to_ligands[cluster]]
        with open(os.path.join(self.prot_cache_path, f'moad_{self.split}_names.txt'), 'w') as f:
            f.write('\n'.join(list_names))
159
+
160
+ def len(self):
161
+ return len(self.split_clusters) * self.multiplicity if self.total_dataset_size is None else self.total_dataset_size
162
+
163
    def get_by_name(self, ligand_name, cluster):
        """Assemble the complex graph for ``ligand_name`` from the cached ligand and receptor.

        Deep copies of the cached ligand graph and of the receptor graph (looked up
        by the first six characters of ``ligand_name``) are merged, ligand positions
        are shifted by the receptor's ``original_center``, and, when
        ``self.chain_cutoff`` is set, receptor chains with no residue within the
        cutoff of the ligand are removed and the complex is re-centred.

        ``cluster`` is only used by the disabled ``if False`` branch below.
        Returns the merged graph, or a different randomly chosen sample (via
        ``self.get``) when no receptor residue is within ``chain_cutoff``.
        """
        ligand_graph = copy.deepcopy(self.ligands[ligand_name])
        complex_graph = copy.deepcopy(self.receptors[ligand_name[:6]])

        # NOTE: this branch is disabled by the leading `False` and never executes.
        if False and self.keep_original and hasattr(ligand_graph['ligand'], 'orig_pos'):
            lig_path = os.path.join(self.moad_dir, 'pdb_superligand', ligand_name + '.pdb')
            lig = Chem.MolFromPDBFile(lig_path)
            formula = np.asarray([atom.GetSymbol() for atom in lig.GetAtoms()])

            # check for same receptor/ligand pair with a different binding position
            for ligand_comp in self.cluster_to_ligands[cluster]:
                if ligand_comp == ligand_name or ligand_comp[:6] != ligand_name[:6]:
                    continue

                lig_path_comp = os.path.join(self.moad_dir, 'pdb_superligand', ligand_comp + '.pdb')
                if not os.path.exists(lig_path_comp):
                    continue

                lig_comp = Chem.MolFromPDBFile(lig_path_comp)
                formula_comp = np.asarray([atom.GetSymbol() for atom in lig_comp.GetAtoms()])

                if formula.shape == formula_comp.shape and np.all(formula == formula_comp) and hasattr(
                        self.ligands[ligand_comp], 'orig_pos'):
                    print(f'Found complex {ligand_comp} to have the same complex/ligand pair, adding it into orig_pos')
                    # add the orig_pos of the binding position
                    if not isinstance(ligand_graph['ligand'].orig_pos, list):
                        ligand_graph['ligand'].orig_pos = [ligand_graph['ligand'].orig_pos]
                    ligand_graph['ligand'].orig_pos.append(self.ligands[ligand_comp].orig_pos)

        # Copy every node/edge store of the ligand graph into the receptor graph.
        for type in ligand_graph.node_types + ligand_graph.edge_types:
            for key, value in ligand_graph[type].items():
                complex_graph[type][key] = value
        complex_graph.name = ligand_graph.name
        # Express ligand coordinates in the receptor-centred frame (pos may be a list of tensors).
        if isinstance(complex_graph['ligand'].pos, list):
            for i in range(len(complex_graph['ligand'].pos)):
                complex_graph['ligand'].pos[i] -= complex_graph.original_center
        else:
            complex_graph['ligand'].pos -= complex_graph.original_center
        if self.require_ligand:
            complex_graph.mol = copy.deepcopy(self.rdkit_ligands[ligand_name])

        if self.chain_cutoff:
            # Pairwise ligand-atom / receptor-residue distances, using the first entry of orig_pos.
            distances = torch.norm(
                (torch.from_numpy(complex_graph['ligand'].orig_pos[0]) - complex_graph.original_center).unsqueeze(1) - complex_graph['receptor'].pos.unsqueeze(0), dim=2)
            # Per receptor residue: distance to the closest ligand atom.
            distances = distances.min(dim=0)[0]
            if torch.min(distances) >= self.chain_cutoff:
                print('minimum distance', torch.min(distances), 'too large', ligand_name,
                      'skipping and returning random. Number of chains',
                      torch.max(complex_graph['receptor'].chain_ids) + 1)
                # randint is inclusive on both ends; get() reduces the index modulo the cluster count.
                return self.get(random.randint(0, self.len()))

            # Keep every residue whose chain has at least one residue within the cutoff.
            within_cutoff = distances < self.chain_cutoff
            chains_within_cutoff = torch.zeros(torch.max(complex_graph['receptor'].chain_ids) + 1)
            chains_within_cutoff.index_add_(0, complex_graph['receptor'].chain_ids, within_cutoff.float())
            chains_within_cutoff_bool = chains_within_cutoff > 0
            residues_to_keep = chains_within_cutoff_bool[complex_graph['receptor'].chain_ids]

            if self.all_atoms:
                # Filter atoms by their residue and renumber the atom -> residue edges
                # to the compacted residue indices.
                atom_to_res_mapping = complex_graph['atom', 'atom_rec_contact', 'receptor'].edge_index[1]
                atoms_to_keep = residues_to_keep[atom_to_res_mapping]
                rec_remapper = (torch.cumsum(residues_to_keep.long(), dim=0) - 1)
                atom_to_res_new_mapping = rec_remapper[atom_to_res_mapping][atoms_to_keep]
                atom_res_edge_index = torch.stack([torch.arange(len(atom_to_res_new_mapping)), atom_to_res_new_mapping])

                complex_graph['atom'].x = complex_graph['atom'].x[atoms_to_keep]
                complex_graph['atom'].pos = complex_graph['atom'].pos[atoms_to_keep]
                complex_graph['atom', 'atom_contact', 'atom'].edge_index = \
                    subgraph(atoms_to_keep, complex_graph['atom', 'atom_contact', 'atom'].edge_index,
                             relabel_nodes=True)[0]
                complex_graph['atom', 'atom_rec_contact', 'receptor'].edge_index = atom_res_edge_index

            complex_graph['receptor'].pos = complex_graph['receptor'].pos[residues_to_keep]
            complex_graph['receptor'].x = complex_graph['receptor'].x[residues_to_keep]
            complex_graph['receptor'].side_chain_vecs = complex_graph['receptor'].side_chain_vecs[residues_to_keep]
            complex_graph['receptor', 'rec_contact', 'receptor'].edge_index = \
                subgraph(residues_to_keep, complex_graph['receptor', 'rec_contact', 'receptor'].edge_index,
                         relabel_nodes=True)[0]

            # Re-centre on the remaining residues and keep original_center consistent.
            extra_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
            complex_graph['receptor'].pos -= extra_center
            if isinstance(complex_graph['ligand'].pos, list):
                for i in range(len(complex_graph['ligand'].pos)):
                    complex_graph['ligand'].pos[i] -= extra_center
            else:
                complex_graph['ligand'].pos -= extra_center
            complex_graph.original_center += extra_center

            complex_graph['receptor'].pop('chain_ids')

        # Strip bookkeeping attributes from the graph and from its receptor store.
        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq',
                  'to_keep', 'chain_ids']:
            if hasattr(complex_graph, a):
                delattr(complex_graph, a)
            if hasattr(complex_graph['receptor'], a):
                delattr(complex_graph['receptor'], a)

        return complex_graph
260
+
261
+ def get(self, idx):
262
+ if self.total_dataset_size is not None:
263
+ idx = random.randint(0, len(self.split_clusters) - 1)
264
+
265
+ idx = idx % len(self.split_clusters)
266
+ cluster = self.split_clusters[idx]
267
+
268
+ if self.no_randomness:
269
+ ligand_name = sorted(self.cluster_to_ligands[cluster])[0]
270
+ else:
271
+ ligand_name = random.choice(self.cluster_to_ligands[cluster])
272
+
273
+ complex_graph = self.get_by_name(ligand_name, cluster)
274
+
275
+ if self.total_dataset_size is not None:
276
+ complex_graph = Batch.from_data_list([complex_graph])
277
+
278
+ return complex_graph
279
+
280
+ def get_all_complexes(self):
281
+ complexes = {}
282
+ for cluster in self.split_clusters:
283
+ for ligand_name in self.cluster_to_ligands[cluster]:
284
+ complexes[ligand_name] = self.get_by_name(ligand_name, cluster)
285
+ return complexes
286
+
287
+ def preprocessing_receptors(self):
288
+ print(f'Processing receptors from [{self.split}] and saving it to [{self.prot_cache_path}]')
289
+
290
+ complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
291
+ if self.limit_complexes is not None and self.limit_complexes != 0:
292
+ complex_names_all = complex_names_all[:self.limit_complexes]
293
+
294
+ receptor_names_all = [l[:6] for l in complex_names_all]
295
+ receptor_names_all = sorted(list(dict.fromkeys(receptor_names_all)))
296
+ print(f'Loading {len(receptor_names_all)} receptors.')
297
+
298
+ if self.esm_embeddings_path is not None:
299
+ id_to_embeddings = torch.load(self.esm_embeddings_path)
300
+ sequences_list = read_strings_from_txt(self.esm_embeddings_sequences_path)
301
+ sequences_to_embeddings = {}
302
+ for i, seq in enumerate(sequences_list):
303
+ sequences_to_embeddings[seq] = id_to_embeddings[str(i)]
304
+
305
+ else:
306
+ sequences_to_embeddings = None
307
+
308
+ # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
309
+ list_indices = list(range(len(receptor_names_all)//1000+1))
310
+ random.shuffle(list_indices)
311
+ for i in list_indices:
312
+ if os.path.exists(os.path.join(self.prot_cache_path, f"receptors{i}.pkl")):
313
+ continue
314
+ receptor_names = receptor_names_all[1000*i:1000*(i+1)]
315
+ receptor_graphs = []
316
+ if self.num_workers > 1:
317
+ p = Pool(self.num_workers, maxtasksperchild=1)
318
+ p.__enter__()
319
+ with tqdm(total=len(receptor_names), desc=f'loading receptors {i}/{len(receptor_names_all)//1000+1}') as pbar:
320
+ map_fn = p.imap_unordered if self.num_workers > 1 else map
321
+ for t in map_fn(self.get_receptor, zip(receptor_names, [sequences_to_embeddings]*len(receptor_names))):
322
+ if t is not None:
323
+ print(len(receptor_graphs))
324
+ receptor_graphs.append(t)
325
+ pbar.update()
326
+ if self.num_workers > 1: p.__exit__(None, None, None)
327
+
328
+ print('Number of receptors: ', len(receptor_graphs))
329
+ with open(os.path.join(self.prot_cache_path, f"receptors{i}.pkl"), 'wb') as f:
330
+ pickle.dump((receptor_graphs), f)
331
+ return receptor_names_all
332
+
333
+ def check_all_receptors(self):
334
+ complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
335
+ if self.limit_complexes is not None and self.limit_complexes != 0:
336
+ complex_names_all = complex_names_all[:self.limit_complexes]
337
+ receptor_names_all = [l[:6] for l in complex_names_all]
338
+ receptor_names_all = list(dict.fromkeys(receptor_names_all))
339
+ for i in range(len(receptor_names_all)//1000+1):
340
+ if not os.path.exists(os.path.join(self.prot_cache_path, f"receptors{i}.pkl")):
341
+ return False
342
+ return True
343
+
344
+ def collect_receptors(self, receptors_to_keep=None, max_receptor_size=None, remove_promiscuous_targets=None):
345
+ complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
346
+ if self.limit_complexes is not None and self.limit_complexes != 0:
347
+ complex_names_all = complex_names_all[:self.limit_complexes]
348
+ receptor_names_all = [l[:6] for l in complex_names_all]
349
+ receptor_names_all = sorted(list(dict.fromkeys(receptor_names_all)))
350
+
351
+ receptor_graphs_all = []
352
+ total_recovered = 0
353
+ print(f'Loading {len(receptor_names_all)} receptors to keep {len(receptors_to_keep)}.')
354
+ for i in range(len(receptor_names_all)//1000+1):
355
+ print(f'prot path: {os.path.join(self.prot_cache_path, f"receptors{i}.pkl")}')
356
+ with open(os.path.join(self.prot_cache_path, f"receptors{i}.pkl"), 'rb') as f:
357
+ l = pickle.load(f)
358
+ total_recovered += len(l)
359
+ if receptors_to_keep is not None:
360
+ l = [t for t in l if t['receptor_name'] in receptors_to_keep]
361
+ receptor_graphs_all.extend(l)
362
+
363
+ cur_len = len(receptor_graphs_all)
364
+ print(f"Kept {len(receptor_graphs_all)} receptors out of {len(receptor_names_all)} total and recovered {total_recovered}")
365
+
366
+ if max_receptor_size is not None:
367
+ receptor_graphs_all = [rec for rec in receptor_graphs_all if rec["receptor"].pos.shape[0] <= max_receptor_size]
368
+ print(f"Kept {len(receptor_graphs_all)} receptors out of {cur_len} after filtering by size")
369
+ cur_len = len(receptor_graphs_all)
370
+
371
+ if remove_promiscuous_targets is not None:
372
+ promiscuous_targets = set()
373
+ for name in complex_names_all:
374
+ l = name.split('_')
375
+ if int(l[3]) > remove_promiscuous_targets:
376
+ promiscuous_targets.add(name[:6])
377
+ receptor_graphs_all = [rec for rec in receptor_graphs_all if rec["receptor_name"] not in promiscuous_targets]
378
+ print(f"Kept {len(receptor_graphs_all)} receptors out of {cur_len} after removing promiscuous targets")
379
+
380
+ self.receptors = {}
381
+ for r in receptor_graphs_all:
382
+ self.receptors[r['receptor_name']] = r
383
+ return
384
+
385
+ def get_receptor(self, par):
386
+ name, sequences_to_embeddings = par
387
+ rec_path = os.path.join(self.moad_dir, 'pdb_protein', name + '_protein.pdb')
388
+ if not os.path.exists(rec_path):
389
+ print("Receptor not found", name, rec_path)
390
+ return None
391
+
392
+ complex_graph = HeteroData()
393
+ complex_graph['receptor_name'] = name
394
+ try:
395
+ moad_extract_receptor_structure(path=rec_path, complex_graph=complex_graph, neighbor_cutoff=self.receptor_radius,
396
+ max_neighbors=self.c_alpha_max_neighbors, sequences_to_embeddings=sequences_to_embeddings,
397
+ knn_only_graph=self.knn_only_graph, all_atoms=self.all_atoms, atom_cutoff=self.atom_radius,
398
+ atom_max_neighbors=self.atom_max_neighbors)
399
+
400
+ except Exception as e:
401
+ print(f'Skipping {name} because of the error:')
402
+ print(e)
403
+ return None
404
+
405
+ protein_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
406
+ complex_graph['receptor'].pos -= protein_center
407
+ if self.all_atoms:
408
+ complex_graph['atom'].pos -= protein_center
409
+ complex_graph.original_center = protein_center
410
+ return complex_graph
411
+
412
+
413
+ def preprocessing_ligands(self):
414
+ print(f'Processing complexes from [{self.split}] and saving it to [{self.lig_cache_path}]')
415
+
416
+ complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
417
+ if self.limit_complexes is not None and self.limit_complexes != 0:
418
+ complex_names_all = complex_names_all[:self.limit_complexes]
419
+ print(f'Loading {len(complex_names_all)} ligands.')
420
+
421
+ # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
422
+ list_indices = list(range(len(complex_names_all)//1000+1))
423
+ random.shuffle(list_indices)
424
+ for i in list_indices:
425
+ if os.path.exists(os.path.join(self.lig_cache_path, f"ligands{i}.pkl")):
426
+ continue
427
+ complex_names = complex_names_all[1000*i:1000*(i+1)]
428
+ ligand_graphs, rdkit_ligands = [], []
429
+ if self.num_workers > 1:
430
+ p = Pool(self.num_workers, maxtasksperchild=1)
431
+ p.__enter__()
432
+ with tqdm(total=len(complex_names), desc=f'loading complexes {i}/{len(complex_names_all)//1000+1}') as pbar:
433
+ map_fn = p.imap_unordered if self.num_workers > 1 else map
434
+ for t in map_fn(self.get_ligand, complex_names):
435
+ if t is not None:
436
+ ligand_graphs.append(t[0])
437
+ rdkit_ligands.append(t[1])
438
+ pbar.update()
439
+ if self.num_workers > 1: p.__exit__(None, None, None)
440
+
441
+ with open(os.path.join(self.lig_cache_path, f"ligands{i}.pkl"), 'wb') as f:
442
+ pickle.dump((ligand_graphs), f)
443
+ with open(os.path.join(self.lig_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
444
+ pickle.dump((rdkit_ligands), f)
445
+
446
+ ligand_graphs_all = []
447
+ for i in range(len(complex_names_all)//1000+1):
448
+ with open(os.path.join(self.lig_cache_path, f"ligands{i}.pkl"), 'rb') as f:
449
+ l = pickle.load(f)
450
+ ligand_graphs_all.extend(l)
451
+ with open(os.path.join(self.lig_cache_path, f"ligands.pkl"), 'wb') as f:
452
+ pickle.dump((ligand_graphs_all), f)
453
+
454
+ rdkit_ligands_all = []
455
+ for i in range(len(complex_names_all) // 1000 + 1):
456
+ with open(os.path.join(self.lig_cache_path, f"rdkit_ligands{i}.pkl"), 'rb') as f:
457
+ l = pickle.load(f)
458
+ rdkit_ligands_all.extend(l)
459
+ with open(os.path.join(self.lig_cache_path, f"rdkit_ligands.pkl"), 'wb') as f:
460
+ pickle.dump((rdkit_ligands_all), f)
461
+
462
+ def get_ligand(self, name):
463
+ if self.split == 'train':
464
+ lig_path = os.path.join(self.moad_dir, 'pdb_superligand', name + '.pdb')
465
+ else:
466
+ lig_path = os.path.join(self.moad_dir, 'pdb_ligand', name + '.pdb')
467
+
468
+ if not os.path.exists(lig_path):
469
+ print("Ligand not found", name, lig_path)
470
+ return None
471
+
472
+ # read pickle
473
+ lig = Chem.MolFromPDBFile(lig_path)
474
+
475
+ if self.max_lig_size is not None and lig.GetNumHeavyAtoms() > self.max_lig_size:
476
+ print(f'Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data.')
477
+ return None
478
+
479
+ try:
480
+ if self.matching:
481
+ smile = Chem.MolToSmiles(lig)
482
+ if '.' in smile:
483
+ print(f'Ligand {name} has multiple fragments and we are doing matching. Not including {name} in preprocessed data.')
484
+ return None
485
+
486
+ complex_graph = HeteroData()
487
+ complex_graph['name'] = name
488
+
489
+ Chem.SanitizeMol(lig)
490
+ get_lig_graph_with_matching(lig, complex_graph, self.popsize, self.maxiter, self.matching, self.keep_original,
491
+ self.num_conformers, remove_hs=self.remove_hs, tries=self.matching_tries, skip_matching=self.skip_matching)
492
+ except Exception as e:
493
+ print(f'Skipping {name} because of the error:')
494
+ print(e)
495
+ return None
496
+
497
+ if self.split != 'train':
498
+ other_positions = [complex_graph['ligand'].orig_pos]
499
+ nsplit = name.split('_')
500
+ for i in range(100):
501
+ new_file = os.path.join(self.moad_dir, 'pdb_ligand', f'{nsplit[0]}_{nsplit[1]}_{nsplit[2]}_{i}.pdb')
502
+ if os.path.exists(new_file):
503
+ if i != int(nsplit[3]):
504
+ lig = Chem.MolFromPDBFile(new_file)
505
+ lig = RemoveHs(lig, sanitize=True)
506
+ other_positions.append(lig.GetConformer().GetPositions())
507
+ else:
508
+ break
509
+ complex_graph['ligand'].orig_pos = np.asarray(other_positions)
510
+
511
+ return complex_graph, lig
512
+
513
+
514
def print_statistics(dataset):
    """Print mean/std/max of a few geometric statistics over all complexes in ``dataset``.

    For each complex: the largest receptor-node norm, the ligand radius around
    its own centroid, the norm of the ligand centroid, ``rmsd_matching`` (0 when
    absent), the ``random_coords`` flag (-1 when absent), and the matching RMSD
    of complexes flagged as random coordinates (a single -1 when there are none).
    """
    protein_radii, molecule_radii, center_distances = [], [], []
    rmsds, random_flags, random_rmsds = [], [], []
    receptor_sizes = []

    for index in range(len(dataset)):
        complex_graph = dataset[index]
        # Ligand positions are either a tensor or a list of tensors; use the first in that case.
        lig_pos = complex_graph['ligand'].pos
        if not torch.is_tensor(lig_pos):
            lig_pos = complex_graph['ligand'].pos[0]
        receptor_pos = complex_graph['receptor'].pos
        receptor_sizes.append(receptor_pos.shape[0])

        molecule_center = torch.mean(lig_pos, dim=0)
        protein_radii.append(torch.max(torch.linalg.vector_norm(receptor_pos, dim=1)))
        molecule_radii.append(
            torch.max(torch.linalg.vector_norm(lig_pos - molecule_center.unsqueeze(0), dim=1)))
        center_distances.append(torch.linalg.vector_norm(molecule_center))

        has_rmsd = "rmsd_matching" in complex_graph
        has_random = "random_coords" in complex_graph
        rmsds.append(complex_graph.rmsd_matching if has_rmsd else 0)
        random_flags.append(int(complex_graph.random_coords) if has_random else -1)
        if has_random and complex_graph.random_coords and has_rmsd:
            random_rmsds.append(complex_graph.rmsd_matching)

    if len(random_rmsds) == 0:
        random_rmsds.append(-1)

    labelled = [
        ('radius protein', protein_radii),
        ('radius molecule', molecule_radii),
        ('distance protein-mol', center_distances),
        ('rmsd matching', rmsds),
        ('random coordinates', random_flags),
        ('random rmsd matching', random_rmsds),
    ]
    print('Number of complexes: ', len(dataset))
    for label, values in labelled:
        array = np.asarray(values)
        print(f"{label}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}")
547
+
datasets/parse_chi.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From Nick Polizzi
2
+ import numpy as np
3
+ from collections import defaultdict
4
+ import prody as pr
5
+ import os
6
+
7
+ from datasets.constants import chi, atom_order, aa_long2short, aa_short2aa_idx, aa_idx2aa_short
8
+
9
+
10
def _chi_atom_indices(resname, chi_num):
    """Return the atom indices for the specified dihedral angle.

    The result holds the positions, within ``atom_order[resname]``, of the four
    atoms named in ``chi[resname][chi_num]``; four NaNs are returned when the
    residue or the chi number is not defined in ``chi``.

    This helper used to be called ``get_dihedral_indices``; that name is
    redefined further down in this module with a one-hot-sequence signature,
    so the two-argument form was never reachable after import.
    """
    if resname not in chi or chi_num not in chi[resname]:
        return np.array([np.nan]*4)
    return np.array([atom_order[resname].index(x) for x in chi[resname][chi_num]])


# Per-residue lookup table: for every residue in atom_order, one row of four
# atom indices for each of chi1..chi4 (rows of NaN where the angle is undefined).
dihedral_indices = defaultdict(list)
for aa in atom_order.keys():
    dihedral_indices[aa] = np.array([_chi_atom_indices(aa, chi_num) for chi_num in range(1, 5)])
26
+
27
+
28
def vector_batch(a, b):
    """Return the difference ``a - b`` (element-wise for arrays)."""
    difference = a - b
    return difference
30
+
31
+
32
def unit_vector_batch(v):
    """Scale every row of ``v`` to unit Euclidean length (norm taken along axis 1)."""
    row_lengths = np.linalg.norm(v, axis=1, keepdims=True)
    return v / row_lengths
34
+
35
+
36
def dihedral_angle_batch(p):
    """Compute dihedral angles, in degrees within [0, 360), for a batch of atom quadruples.

    ``p`` is indexed as ``p[:, k]`` for the k-th of four points, each row being
    one quadruple of 3-D coordinates.
    """
    first, second, third, fourth = p[:, 0], p[:, 1], p[:, 2], p[:, 3]
    bond_a = first - second
    bond_b = second - third
    bond_c = third - fourth

    # Normals of the two planes spanned by consecutive bonds.
    normal_ab = np.cross(bond_a, bond_b)
    normal_bc = np.cross(bond_b, bond_c)

    # Vector orthogonal to normal_ab and to the central bond; gives the sine component.
    central_unit = bond_b / np.linalg.norm(bond_b, axis=1, keepdims=True)
    in_plane = np.cross(normal_ab, central_unit)

    cos_term = np.sum(normal_ab * normal_bc, axis=1)
    sin_term = np.sum(in_plane * normal_bc, axis=1)

    angles = np.degrees(np.arctan2(sin_term, cos_term))
    # arctan2 yields (-180, 180]; shift the negative half up by a full turn.
    negative = angles < 0
    angles[negative] += 360
    return angles
54
+
55
+
56
def batch_compute_dihedral_angles(sidechains):
    """Convert ``sidechains`` to a NumPy array and return its dihedral angles in degrees."""
    return dihedral_angle_batch(np.array(sidechains))
60
+
61
+
62
def get_coords(prody_pdb):
    """Return an ``(n_residues, 14, 3)`` array of atom coordinates, NaN where an atom is missing.

    Residues are taken in sorted order of the residue indices of the C-alpha
    selection; atom slots follow ``atom_order`` for the residue's one-letter
    code (``'X'`` when the residue name is not in ``aa_long2short``).
    """
    residue_ids = sorted(set(prody_pdb.ca.getResindices()))
    coords = np.full((len(residue_ids), 14, 3), np.nan)
    for row, resind in enumerate(residue_ids):
        residue = prody_pdb.select(f'resindex {resind}')
        resname = residue.getResnames()[0]
        short_name = aa_long2short[resname] if resname in aa_long2short else 'X'
        for col, name in enumerate(atom_order[short_name]):
            atom = residue.select(f'name {name}')
            if atom is None:
                # select() returned nothing for this atom name: leave the slot as NaN.
                coords[row, col, :] = [np.nan, np.nan, np.nan]
                continue
            coords[row, col, :] = atom.getCoords()[0]
    return coords
75
+
76
+
77
def get_onehot_sequence(seq):
    """Return a ``(len(seq), 20)`` one-hot encoding of the one-letter sequence ``seq``.

    Letters missing from ``aa_short2aa_idx`` are encoded in column 7.
    """
    onehot = np.zeros((len(seq), 20))
    for row, residue in enumerate(seq):
        if residue in aa_short2aa_idx:
            column = aa_short2aa_idx[residue]
        else:
            column = 7  # 7 is the index for GLY
        onehot[row, column] = 1
    return onehot
83
+
84
+
85
def get_dihedral_indices(onehot_sequence):
    """Look up the per-residue chi atom-index tables for a one-hot encoded sequence.

    The residue type of each row is the column of its non-zero entry; the
    result stacks ``dihedral_indices[...]`` for every residue.
    """
    residue_type_ids = np.where(onehot_sequence)[1]
    per_residue = [dihedral_indices[aa_idx2aa_short[aa_idx]] for aa_idx in residue_type_ids]
    return np.array(per_residue)
87
+
88
+
89
def _get_chi_angles(coords, indices):
    """Gather the atoms of each chi angle and compute the angles in degrees.

    Args:
        coords: per-residue atom coordinates, indexed ``[residue, atom_slot, xyz]``.
        indices: per-residue atom-slot indices, four per chi angle and four chi
            angles per residue, with NaN marking undefined entries.

    Returns:
        Array of shape ``(N, 4)`` with ``N = coords.shape[0]``; angles built from
        an undefined index come out as NaN.
    """
    num_residues = coords.shape[0]
    undefined = np.isnan(indices)
    # Replace NaN with a valid placeholder slot *before* the integer cast:
    # casting NaN to int is an invalid cast (recent NumPy warns about it).
    atom_slots = indices.copy()
    atom_slots[undefined] = 0
    atom_slots = atom_slots.astype(int)
    # Advanced indexing returns a copy, so coords itself is not modified below.
    quads = coords[np.arange(num_residues)[:, None, None], atom_slots, :]
    quads[undefined] = np.nan
    chi_angles = batch_compute_dihedral_angles(quads.reshape(-1, 4, 3)).reshape(num_residues, 4)
    return chi_angles
99
+
100
+
101
def get_chi_angles(coords, seq, return_onehot=False):
    """Compute the side-chain chi angles for a sequence and its atom coordinates.

    Parameters
    ----------
    coords : numpy array
        Per-residue atom coordinates, indexed as [residue, atom slot, xyz]
        (presumably the (N, 14, 3) array produced by get_coords -- confirm at call sites).
    seq : sequence of one-letter residue codes
        One entry per residue, in the same order as ``coords``.
    return_onehot : bool, optional
        also return the one-hot encoding of ``seq``, by default False

    Returns
    -------
    numpy array of shape (N, 4)
        Chi angles in degrees, one row per residue. Entries whose atom indices
        are undefined in the lookup table are np.nan.
        When ``return_onehot`` is True, a tuple ``(chi_angles, onehot)`` is returned.
    """
    onehot = get_onehot_sequence(seq)
    chi_atom_indices = get_dihedral_indices(onehot)
    chi_angles = _get_chi_angles(coords, chi_atom_indices)
    if return_onehot:
        return chi_angles, onehot
    return chi_angles
124
+
125
+
126
def test_get_chi_angles(print_chi_angles=False):
    """Smoke-test ``get_chi_angles`` on chain A of PDB entry 6w70.

    Returns True when the assertions pass; optionally prints the angles.
    """
    # needs an internet connection, or '6w70.pdb' in the working directory
    pdb = pr.parsePDB('6w70')
    prody_pdb = pdb.select('chain A')
    # get_chi_angles takes (coords, seq); passing the ProDy selection alone
    # raised a TypeError for the missing `seq` argument.
    coords = get_coords(prody_pdb)
    # NOTE(review): relies on ProDy's getSequence() for the one-letter sequence -- confirm
    seq = prody_pdb.ca.getSequence()
    chi_angles = get_chi_angles(coords, seq)
    assert chi_angles.shape == (prody_pdb.ca.numAtoms(), 4)
    assert chi_angles[0,0] < 56.0 and chi_angles[0,0] > 55.0
    print('test_get_chi_angles passed')
    try:
        os.remove('6w70.pdb.gz')
    except OSError:
        # Best-effort cleanup: the downloaded file may not exist.
        pass
    if print_chi_angles:
        print(chi_angles)
    return True
141
+
142
+
143
if __name__ == '__main__':
    # Script entry point: run the chi-angle self-check and print the computed angles.
    test_get_chi_angles(print_chi_angles=True)
145
+
146
+
datasets/pdb.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Significant contribution from Ben Fry
2
+
3
+ import copy
4
+ import os.path
5
+ import pickle
6
+ import random
7
+ from multiprocessing import Pool
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ import torch
12
+ from rdkit import Chem
13
+ from rdkit.Chem import AllChem, MolFromSmiles
14
+ from scipy.spatial.distance import pdist, squareform
15
+ from torch_geometric.data import Dataset, HeteroData
16
+ from torch_geometric.utils import subgraph
17
+ from tqdm import tqdm
18
+
19
+ from datasets.constants import aa_to_cg_indices, amino_acid_smiles, cg_rdkit_indices
20
+ from datasets.parse_chi import aa_long2short, atom_order
21
+ from datasets.process_mols import new_extract_receptor_structure, get_lig_graph, generate_conformer
22
+ from utils.torsion import get_transformation_mask
23
+
24
+
25
def read_strings_from_txt(path):
    """Return the lines of the text file at `path`, each with trailing whitespace removed."""
    stripped = []
    with open(path) as handle:
        for raw_line in handle.readlines():
            stripped.append(raw_line.rstrip())
    return stripped
30
+
31
+
32
def compute_num_ca_neighbors(coords, cg_coords, idx, is_valid_bb_node, max_dist=5, buffer_residue_num=7):
    """
    Count the residues that have at least one atom within max_dist (Angstroms) of any
    chemical-group atom, skipping residue `idx` itself and the residues within
    +/- buffer_residue_num of it in primary sequence.
    From Ben's code
    Note: Gabriele removed the chain_index

    coords is indexed as (residue, atom, xyz) and cg_coords as (atom, xyz);
    is_valid_bb_node is a per-residue mask, and residues where it is False are not counted.
    """
    n_res = coords.shape[0]
    n_cg = cg_coords.shape[0]

    # Residues that must never be counted: idx and its sequence neighbourhood
    # (clipped at 0 on the left; indices past the end are harmless).
    excluded = list(range(max(idx - buffer_residue_num, 0), idx + 1))
    excluded += list(range(idx + 1, idx + buffer_residue_num + 1))

    # Enumerate every (residue, chemical-group atom) pair, residue-major.
    res_of_pair = torch.arange(n_res).repeat_interleave(n_cg)
    cg_of_pair = torch.arange(n_cg).repeat(n_res)

    # For each pair: distances from all atoms of the residue to the single CG atom.
    residue_atoms = coords[res_of_pair]
    cg_atom = cg_coords[cg_of_pair].unsqueeze(1)
    is_close = torch.cdist(residue_atoms, cg_atom).squeeze(-1) < max_dist

    # Rows with any close atom -> the residues they belong to (deduplicated).
    close_rows, _ = is_close.nonzero(as_tuple=True)
    nearby = torch.unique(res_of_pair[close_rows])

    # Drop sequence-adjacent residues and residues without a fully resolved backbone frame.
    not_excluded = ~torch.isin(nearby, torch.tensor(excluded))
    countable = not_excluded & is_valid_bb_node[nearby]

    return len(nearby[countable])
65
+
66
+
67
def identify_valid_vandermers(args):
    """
    Compute, for every residue of one protein, the number of contacts of its chemical group.
    Every sidechain is treated as a chemical group here; the actual chemical groups are
    loaded at training time. Divided by their sum, the counts can be used as sampling
    probabilities.

    `args` is a (complex_graph, max_dist, buffer_residue_num) tuple so the function can be
    used directly with map / Pool.imap. Returns (complex_graph.name, tensor of counts).
    """
    complex_graph, max_dist, buffer_residue_num = args
    coords = complex_graph.coords
    seq = complex_graph.seq

    # A residue is a valid backbone node when none of its first four atoms has a NaN coordinate.
    is_valid_bb_node = (coords[:, :4].isnan().sum(dim=(1, 2)) == 0).bool()

    contact_counts = []
    for res_idx, residue in enumerate(seq):
        # Residues without a chemical-group definition get zero contacts.
        if residue not in aa_to_cg_indices:
            contact_counts.append(0)
            continue

        cg_coordinates = coords[res_idx][aa_to_cg_indices[residue]]

        # Chemical groups that aren't fully resolved get zero contacts.
        if torch.any(cg_coordinates.isnan()).item():
            contact_counts.append(0)
            continue

        contact_counts.append(
            compute_num_ca_neighbors(coords, cg_coordinates, res_idx, is_valid_bb_node,
                                     max_dist=max_dist, buffer_residue_num=buffer_residue_num)
        )

    return complex_graph.name, torch.tensor(contact_counts)
99
+
100
+
101
def fast_identify_valid_vandermers(coords, seq, max_dist=5, buffer_residue_num=7):
    """
    Vectorised per-residue contact count.

    For every residue, counts the other residues that have at least one atom closer than
    max_dist to one of its atoms, ignoring the residue itself and residues within
    +/- buffer_residue_num positions in the sequence. Atoms with NaN coordinates never
    produce a contact.

    Parameters
    ----------
    coords : torch.Tensor
        CPU tensor indexed as (residue, atom, xyz). Any number of atoms per residue is
        accepted (the original implementation required exactly 14).
    seq : sequence
        Residue sequence; only its length is used and it must equal coords.shape[0].
    max_dist : float
        Contact distance threshold.
    buffer_residue_num : int
        Half-width of the sequence window excluded around each residue.

    Returns
    -------
    torch.Tensor
        Integer tensor with one contact count per residue.

    Raises
    ------
    ValueError
        If len(seq) does not match the number of residues in coords.
    """
    num_res, num_atoms = coords.shape[0], coords.shape[1]
    if len(seq) != num_res:
        raise ValueError(f"seq has {len(seq)} residues but coords has {num_res}")

    # Any value >= max_dist works as "no contact"; used to neutralise NaN distances.
    offset = 10000 + max_dist

    # All-atom pairwise distances, regrouped as (res_i, atom_i, res_j, atom_j).
    flat_coords = coords.numpy().reshape(-1, 3)
    atom_dists = squareform(pdist(flat_coords)).reshape((num_res, num_atoms, num_res, num_atoms))
    atom_dists = np.nan_to_num(atom_dists, nan=offset)

    # Residue-residue distance = closest pair of atoms.
    res_dists = np.min(atom_dists, axis=(1, 3))

    # Band mask of residue pairs that are too close in sequence (includes the diagonal).
    # Unlike stacking np.diag offsets, this also works when the chain is shorter than
    # buffer_residue_num.
    positions = np.arange(num_res)
    near_in_sequence = np.abs(positions[:, None] - positions[None, :]) <= buffer_residue_num

    # get number of residues that are within max_dist of each other
    nbr_count = np.sum((res_dists < max_dist) & ~near_in_sequence, axis=1)
    return torch.tensor(nbr_count)
120
+
121
+
122
def compute_cg_features(aa, aa_smile):
    """
    Build the ligand graph of the chemical group of an amino acid from its SMILES string.

    `aa` is the long residue name (a key of aa_long2short and cg_rdkit_indices). The full
    molecule graph is built with get_lig_graph and then restricted to the atoms listed in
    cg_rdkit_indices[aa]; edge_mask / mask_rotate from get_transformation_mask are attached
    to the 'ligand' store.

    Returns the HeteroData graph, or None for residues that have no chemical group
    (ex: GLY if not using bb_cnh and bb_cco).
    """
    if aa_long2short[aa] not in aa_to_cg_indices:
        return None

    # Full-molecule ligand graph from the SMILES string.
    molecule = Chem.MolFromSmiles(aa_smile)
    complex_graph = HeteroData()
    get_lig_graph(molecule, complex_graph)

    # Keep only the chemical-group atoms and the bonds among them (relabelled to 0..k-1).
    atoms_to_keep = torch.tensor(list(cg_rdkit_indices[aa])).long()
    bond_store = complex_graph['ligand', 'ligand']
    kept_edge_index, kept_edge_attr = subgraph(atoms_to_keep, bond_store.edge_index, bond_store.edge_attr,
                                               relabel_nodes=True)
    complex_graph['ligand', 'ligand'].edge_index = kept_edge_index
    complex_graph['ligand', 'ligand'].edge_attr = kept_edge_attr
    complex_graph['ligand'].x = complex_graph['ligand'].x[atoms_to_keep]

    edge_mask, mask_rotate = get_transformation_mask(complex_graph)
    complex_graph['ligand'].edge_mask = torch.tensor(edge_mask)
    complex_graph['ligand'].mask_rotate = mask_rotate
    return complex_graph
148
+
149
+
150
class PDBSidechain(Dataset):
    """Dataset of single protein chains read from `<root>/pdb/**/<chain>.pt` files.

    Chains are grouped by the cluster id listed in `<root>/list.csv`; one item is one
    cluster, from which a random chain is drawn. With `vandermers_extraction` a sidechain
    with enough contacts is cut out of the chain (together with its sequence
    neighbourhood) and attached as the 'ligand' of the returned graph. With
    `add_random_ligand` a ligand built from a SMILES string is attached instead.

    Preprocessed graphs and contact counts are cached as pickles under `cache_path`, in
    chunks of 10000 chains.
    NOTE: the caches are read with pickle and must only come from trusted local runs.
    """

    def __init__(self, root, transform=None, cache_path='data/cache', split='train', limit_complexes=0,
                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, remove_hs=True, all_atoms=False,
                 atom_radius=5, atom_max_neighbors=None, sequences_to_embeddings=None,
                 knn_only_graph=True, multiplicity=1, vandermers_max_dist=5, vandermers_buffer_residue_num=7,
                 vandermers_min_contacts=5, remove_second_segment=False, merge_clusters=1, vandermers_extraction=True,
                 add_random_ligand=False):
        """Read the split, build or load the cached graphs and filter the usable proteins."""

        super(PDBSidechain, self).__init__(root, transform)
        assert remove_hs == True, "not implemented yet"
        self.root = root
        self.split = split
        self.limit_complexes = limit_complexes
        self.receptor_radius = receptor_radius
        self.knn_only_graph = knn_only_graph
        self.multiplicity = multiplicity
        self.c_alpha_max_neighbors = c_alpha_max_neighbors
        self.num_workers = num_workers
        self.sequences_to_embeddings = sequences_to_embeddings
        self.remove_second_segment = remove_second_segment
        self.merge_clusters = merge_clusters
        self.vandermers_extraction = vandermers_extraction
        self.add_random_ligand = add_random_ligand
        self.all_atoms = all_atoms
        self.atom_radius = atom_radius
        self.atom_max_neighbors = atom_max_neighbors

        if vandermers_extraction:
            # One template ligand graph per (short) residue name; None where no chemical group exists.
            self.cg_node_feature_lookup_dict = {aa_long2short[aa]: compute_cg_features(aa, aa_smile) for aa, aa_smile in
                                                amino_acid_smiles.items()}

        self.cache_path = os.path.join(cache_path, f'PDB3_limit{self.limit_complexes}_INDEX{self.split}'
                                                   f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
                                       + (''if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
                                       + ('' if not self.knn_only_graph else '_knnOnly'))
        self.read_split()

        if not self.check_all_proteins():
            os.makedirs(self.cache_path, exist_ok=True)
            self.preprocess()

        self.vandermers_max_dist = vandermers_max_dist
        self.vandermers_buffer_residue_num = vandermers_buffer_residue_num
        self.vandermers_min_contacts = vandermers_min_contacts
        self.collect_proteins()

        if vandermers_extraction:
            # Keep proteins that have at least one residue with >= 10 contacts.
            filtered_proteins = [
                complex_graph for complex_graph in tqdm(self.protein_graphs)
                if complex_graph.name in self.vandermers and torch.any(self.vandermers[complex_graph.name] >= 10)
            ]
            print(f"Computed vandermers and kept {len(filtered_proteins)} proteins out of {len(self.protein_graphs)}")
        else:
            filtered_proteins = self.protein_graphs

        # Keep proteins for which a language-model embedding is available (if embeddings are used).
        second_filter = [
            complex_graph for complex_graph in tqdm(filtered_proteins)
            if sequences_to_embeddings is None or complex_graph.orig_seq in sequences_to_embeddings
        ]
        print(f"Checked embeddings available and kept {len(second_filter)} proteins out of {len(filtered_proteins)}")

        self.protein_graphs = second_filter

        # filter clusters that have no protein graphs
        self.split_clusters = list({g.cluster for g in self.protein_graphs})
        self.cluster_to_complexes = {c: [] for c in self.split_clusters}
        for p in self.protein_graphs:
            self.cluster_to_complexes[p['cluster']].append(p)
        self.split_clusters = [c for c in self.split_clusters if len(self.cluster_to_complexes[c]) > 0]
        print("Total elements in set", len(self.split_clusters) * self.multiplicity // self.merge_clusters)

        self.name_to_complex = {p.name: p for p in self.protein_graphs}
        self.define_probabilities()

        if self.add_random_ligand:
            # read csv with all smiles (first column of every line)
            with open('data/smiles_list.csv', 'r') as f:
                self.smiles_list = f.readlines()
            self.smiles_list = [s.split(',')[0] for s in self.smiles_list]

    def define_probabilities(self):
        """Set self.probabilities, the unnormalised sampling weight for each contact count (0..999)."""
        if not self.vandermers_extraction:
            return

        if self.vandermers_min_contacts is not None:
            # Weight grows linearly with the contact count; counts below the minimum get 0.
            self.probabilities = torch.arange(1000) - self.vandermers_min_contacts + 1
            self.probabilities[:self.vandermers_min_contacts] = 0
        else:
            # Reweight so the contact-count distribution matches the one stored for PDBBind.
            with open('data/pdbbind_counts.pkl', 'rb') as f:
                pdbbind_counts = pickle.load(f)

            pdb_counts = torch.ones(1000)
            for contacts in self.vandermers.values():
                pdb_counts.index_add_(0, contacts, torch.ones(contacts.shape))
            print(pdbbind_counts[:30])
            print(pdb_counts[:30])

            self.probabilities = pdbbind_counts / pdb_counts
            self.probabilities[:7] = 0

    def len(self):
        """Number of items: clusters times multiplicity, divided by merge_clusters."""
        return len(self.split_clusters) * self.multiplicity // self.merge_clusters

    def get(self, idx=None, protein=None, smiles=None):
        """Return one processed protein graph.

        Either `idx` selects a cluster (a random chain of it is used), or both `protein`
        (a chain name) and `smiles` are given. When the drawn protein cannot be used
        (embedding length mismatch, no sidechain with enough contacts, no second segment)
        a different random item is returned instead. Returns None only when a conformer
        for the explicitly given `smiles` cannot be generated.
        """
        assert idx is not None or (protein is not None and smiles is not None), "provide idx or protein or smile"

        if protein is None or smiles is None:
            idx = idx % len(self.split_clusters)
            if self.merge_clusters > 1:
                idx = idx * self.merge_clusters
                idx = idx + random.randint(0, self.merge_clusters - 1)
                idx = min(idx, len(self.split_clusters) - 1)
            cluster = self.split_clusters[idx]
            protein_graph = copy.deepcopy(random.choice(self.cluster_to_complexes[cluster]))
        else:
            protein_graph = copy.deepcopy(self.name_to_complex[protein])

        if self.sequences_to_embeddings is not None:
            if len(protein_graph.orig_seq) != len(self.sequences_to_embeddings[protein_graph.orig_seq]):
                print('problem with ESM embeddings')
                return self.get(random.randint(0, self.len()))

            # Embeddings cover the original sequence; keep only the residues that survived load_chain.
            lm_embeddings = self.sequences_to_embeddings[protein_graph.orig_seq][protein_graph.to_keep]
            protein_graph['receptor'].x = torch.cat([protein_graph['receptor'].x, lm_embeddings], dim=1)

        if self.vandermers_extraction:
            # select sidechain to remove
            vandermers_contacts = self.vandermers[protein_graph.name]
            vandermers_probs = self.probabilities[vandermers_contacts].numpy()

            if not np.any(vandermers_contacts.numpy() >= 10):
                print('no vandarmers >= 10 retrying with new one')
                return self.get(random.randint(0, self.len()))

            sidechain_idx = np.random.choice(np.arange(len(vandermers_probs)), p=vandermers_probs / np.sum(vandermers_probs))

            # remove part of the sequence
            num_residues = len(protein_graph.seq)
            buffer_num = self.vandermers_buffer_residue_num
            residues_to_keep = np.ones(num_residues, dtype=bool)
            residues_to_keep[max(0, sidechain_idx - buffer_num):
                             min(sidechain_idx + buffer_num + 1, num_residues)] = False

            if self.remove_second_segment:
                # Sample a second sidechain more than limit_closeness away from the first and
                # outside its sequence window, and remove the window around it as well.
                pos_idx = protein_graph['receptor'].pos[sidechain_idx]
                limit_closeness = 10
                far_enough = torch.sum((protein_graph['receptor'].pos - pos_idx[None, :]) ** 2, dim=-1) > limit_closeness ** 2
                vandermers_probs = vandermers_probs * far_enough.float().numpy()
                vandermers_probs[max(0, sidechain_idx - buffer_num):
                                 min(sidechain_idx + buffer_num + 1, num_residues)] = 0
                if np.all(vandermers_probs<=0):
                    print('no second vandermer available retrying with new one')
                    return self.get(random.randint(0, self.len()))
                sc2_idx = np.random.choice(np.arange(len(vandermers_probs)), p=vandermers_probs / np.sum(vandermers_probs))

                residues_to_keep[max(0, sc2_idx - buffer_num):
                                 min(sc2_idx + buffer_num + 1, num_residues)] = False

            residues_to_keep = torch.from_numpy(residues_to_keep)
            protein_graph['receptor'].pos = protein_graph['receptor'].pos[residues_to_keep]
            protein_graph['receptor'].x = protein_graph['receptor'].x[residues_to_keep]
            protein_graph['receptor'].side_chain_vecs = protein_graph['receptor'].side_chain_vecs[residues_to_keep]
            protein_graph['receptor', 'rec_contact', 'receptor'].edge_index = \
                subgraph(residues_to_keep, protein_graph['receptor', 'rec_contact', 'receptor'].edge_index, relabel_nodes=True)[0]

            # create the sidechain ligand
            sidechain_aa = protein_graph.seq[sidechain_idx]
            ligand_graph = self.cg_node_feature_lookup_dict[sidechain_aa]
            ligand_graph['ligand'].pos = protein_graph.coords[sidechain_idx][protein_graph.mask[sidechain_idx]]

            # Copy every ligand store (nodes and edges) into the protein graph.
            for store_key in ligand_graph.node_types + ligand_graph.edge_types:
                for key, value in ligand_graph[store_key].items():
                    protein_graph[store_key][key] = value

            protein_graph['ligand'].orig_pos = protein_graph['ligand'].pos.numpy()
            protein_center = torch.mean(protein_graph['receptor'].pos, dim=0, keepdim=True)
            protein_graph['receptor'].pos = protein_graph['receptor'].pos - protein_center
            protein_graph['ligand'].pos = protein_graph['ligand'].pos - protein_center
            protein_graph.original_center = protein_center
            protein_graph['receptor_name'] = protein_graph.name
        else:
            protein_center = torch.mean(protein_graph['receptor'].pos, dim=0, keepdim=True)
            protein_graph['receptor'].pos = protein_graph['receptor'].pos - protein_center
            protein_graph.original_center = protein_center
            protein_graph['receptor_name'] = protein_graph.name

        if self.add_random_ligand:
            if smiles is not None:
                mol = MolFromSmiles(smiles)
                try:
                    generate_conformer(mol)
                except Exception as e:
                    print("failed to generate the given ligand returning None", e)
                    return None
            else:
                # Draw SMILES strings until a conformer can be generated.
                success = False
                while not success:
                    smiles = random.choice(self.smiles_list)
                    mol = MolFromSmiles(smiles)
                    try:
                        success = not generate_conformer(mol)
                    except Exception as e:
                        print(e, "changing ligand")

            lig_graph = HeteroData()
            get_lig_graph(mol, lig_graph)

            edge_mask, mask_rotate = get_transformation_mask(lig_graph)
            lig_graph['ligand'].edge_mask = torch.tensor(edge_mask)
            lig_graph['ligand'].mask_rotate = mask_rotate
            lig_graph['ligand'].smiles = smiles
            lig_graph['ligand'].pos = lig_graph['ligand'].pos - torch.mean(lig_graph['ligand'].pos, dim=0, keepdim=True)

            for store_key in lig_graph.node_types + lig_graph.edge_types:
                for key, value in lig_graph[store_key].items():
                    protein_graph[store_key][key] = value

        # Drop bookkeeping attributes that are not needed downstream.
        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq', 'to_keep', 'chain_ids']:
            if hasattr(protein_graph, a):
                delattr(protein_graph, a)
            if hasattr(protein_graph['receptor'], a):
                delattr(protein_graph['receptor'], a)

        return protein_graph

    def read_split(self):
        """Fill self.chains_in_cluster with the (chain, cluster) pairs of the current split.

        Raises ValueError when self.split is not 'train', 'val' or 'test'.
        """
        # read CSV file
        df = pd.read_csv(self.root + "/list.csv")
        print("Loaded list CSV file")

        # get clusters and filter by split
        if self.split == "train":
            # The cluster files contain text while the CSV column is numeric: compare as ints.
            # Comparing the raw strings against the numeric ids never matched, which left the
            # validation and test clusters inside the training split.
            # NOTE: caches written before this fix still contain those chains and must be rebuilt.
            held_out = {int(s) for s in read_strings_from_txt(self.root + "/valid_clusters.txt")}
            held_out |= {int(s) for s in read_strings_from_txt(self.root + "/test_clusters.txt")}
            clusters = [int(c) for c in df["CLUSTER"].unique() if int(c) not in held_out]
        elif self.split == "val":
            clusters = [int(s) for s in read_strings_from_txt(self.root + "/valid_clusters.txt")]
        elif self.split == "test":
            clusters = [int(s) for s in read_strings_from_txt(self.root + "/test_clusters.txt")]
        else:
            raise ValueError("Split must be train, val or test")
        print(self.split, "clusters", len(clusters))
        clusters = set(clusters)

        self.chains_in_cluster = []
        complexes_in_cluster = set()
        for chain, cluster in zip(df["CHAINID"], df["CLUSTER"]):
            if cluster not in clusters:
                continue
            # limit to one chain per complex (the first four characters identify the complex)
            if chain[:4] not in complexes_in_cluster:
                self.chains_in_cluster.append((chain, cluster))
                complexes_in_cluster.add(chain[:4])
        print("Filtered chains in cluster", len(self.chains_in_cluster))

        if self.limit_complexes > 0:
            self.chains_in_cluster = self.chains_in_cluster[:self.limit_complexes]

    def check_all_proteins(self):
        """Return True when every protein-graph cache chunk exists on disk."""
        for i in range(len(self.chains_in_cluster)//10000+1):
            if not os.path.exists(os.path.join(self.cache_path, f"protein_graphs{i}.pkl")):
                return False
        return True

    def _imap(self, fn, items):
        """Yield fn(item) for every item, through a worker pool when num_workers > 1.

        The pool is always terminated, also when the consumer stops early or an
        exception is raised.
        """
        if self.num_workers > 1:
            pool = Pool(self.num_workers, maxtasksperchild=1)
            try:
                yield from pool.imap_unordered(fn, items)
            finally:
                pool.terminate()
        else:
            yield from map(fn, items)

    def collect_proteins(self):
        """Load all cached protein graphs and, if needed, load or compute their contact counts.

        Fills self.protein_graphs and self.vandermers (chain name -> contact-count tensor).
        """
        self.protein_graphs = []
        self.vandermers = {}
        print(f'Loading {len(self.chains_in_cluster)} protein graphs.')
        list_indices = list(range(len(self.chains_in_cluster) // 10000 + 1))
        random.shuffle(list_indices)
        for i in list_indices:
            with open(os.path.join(self.cache_path, f"protein_graphs{i}.pkl"), 'rb') as f:
                print(i)
                chunk_graphs = pickle.load(f)
            self.protein_graphs.extend(chunk_graphs)

            if not self.vandermers_extraction:
                continue

            vandermers_file = os.path.join(
                self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl')

            # Reuse cached contact counts when available.
            if os.path.exists(vandermers_file):
                with open(vandermers_file, 'rb') as f:
                    vandermers = pickle.load(f)
                self.vandermers.update(vandermers)
                continue

            vandermers = {}
            with tqdm(total=len(chunk_graphs), desc=f'computing vandermers {i}') as pbar:
                arguments = zip(chunk_graphs, [self.vandermers_max_dist] * len(chunk_graphs),
                                [self.vandermers_buffer_residue_num] * len(chunk_graphs))
                for t in self._imap(identify_valid_vandermers, arguments):
                    if t is not None:
                        vandermers[t[0]] = t[1]
                    pbar.update()

            with open(vandermers_file, 'wb') as f:
                pickle.dump(vandermers, f)
            self.vandermers.update(vandermers)

        print(f"Kept {len(self.protein_graphs)} proteins out of {len(self.chains_in_cluster)} total")
        return

    def preprocess(self):
        """Build the protein graphs of every missing cache chunk and pickle them."""
        # running preprocessing in parallel on multiple workers and saving the progress every 10000 proteins
        list_indices = list(range(len(self.chains_in_cluster) // 10000 + 1))
        random.shuffle(list_indices)
        for i in list_indices:
            chunk_file = os.path.join(self.cache_path, f"protein_graphs{i}.pkl")
            if os.path.exists(chunk_file):
                continue
            chains_names = self.chains_in_cluster[10000 * i:10000 * (i + 1)]
            protein_graphs = []
            with tqdm(total=len(chains_names),
                      desc=f'loading protein batch {i}/{len(self.chains_in_cluster) // 10000 + 1}') as pbar:
                for t in self._imap(self.load_chain, chains_names):
                    # load_chain returns None for chains that cannot be processed.
                    if t is not None:
                        protein_graphs.append(t)
                    pbar.update()

            with open(chunk_file, 'wb') as f:
                pickle.dump(protein_graphs, f)

        print("Finished preprocessing and saving protein graphs")

    def load_chain(self, c):
        """Build the graph of one (chain, cluster) pair; return None when the chain is unusable."""
        chain, cluster = c
        chain_file = self.root + f"/pdb/{chain[1:3]}/{chain}.pt"
        if not os.path.exists(chain_file):
            print("File not found", chain)
            return None

        data = torch.load(chain_file)
        complex_graph = HeteroData()
        complex_graph['name'] = chain
        orig_seq = data["seq"]
        coords = data["xyz"]
        mask = data["mask"].bool()

        # remove residues with NaN backbone coordinates (checked on x of the first four atoms)
        to_keep = torch.logical_not(torch.any(torch.isnan(coords[:, :4, 0]), dim=1))
        coords = coords[to_keep]
        seq = ''.join(np.asarray(list(orig_seq))[to_keep.numpy()].tolist())
        mask = mask[to_keep]

        if len(coords) == 0:
            print("All coords were NaN", chain)
            return None

        try:
            new_extract_receptor_structure(seq, coords.numpy(), complex_graph=complex_graph, neighbor_cutoff=self.receptor_radius,
                                           max_neighbors=self.c_alpha_max_neighbors, knn_only_graph=self.knn_only_graph,
                                           all_atoms=self.all_atoms, atom_cutoff=self.atom_radius,
                                           atom_max_neighbors=self.atom_max_neighbors)
        except Exception as e:
            print("Error in extracting receptor", chain)
            print(e)
            return None

        if torch.any(torch.isnan(complex_graph['receptor'].pos)):
            print("NaN in pos receptor", chain)
            return None

        complex_graph.coords = coords
        complex_graph.seq = seq
        complex_graph.mask = mask
        complex_graph.cluster = cluster
        complex_graph.orig_seq = orig_seq
        complex_graph.to_keep = to_keep
        return complex_graph
528
+
529
+
530
# Manual smoke test: build a small training set from the sample data and print its items.
if __name__ == "__main__":
    dataset = PDBSidechain(
        root="data/pdb_2021aug02_sample",
        split="train",
        multiplicity=1,
        limit_complexes=150,
    )
    print(len(dataset))
    print(dataset[0])
    for sample in dataset:
        print(sample)
datasets/pdbbind.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import binascii
2
+ import glob
3
+ import os
4
+ import pickle
5
+ from collections import defaultdict
6
+ from multiprocessing import Pool
7
+ import random
8
+ import copy
9
+ import torch.nn.functional as F
10
+ import numpy as np
11
+ import torch
12
+ from rdkit import Chem
13
+ from rdkit.Chem import MolFromSmiles, AddHs
14
+ from torch_geometric.data import Dataset, HeteroData
15
+ from torch_geometric.transforms import BaseTransform
16
+ from tqdm import tqdm
17
+ from rdkit.Chem import RemoveAllHs
18
+
19
+ from datasets.process_mols import read_molecule, get_lig_graph_with_matching, generate_conformer, moad_extract_receptor_structure
20
+ from utils.diffusion_utils import modify_conformer, set_time
21
+ from utils.utils import read_strings_from_txt, crop_beyond
22
+ from utils import so3, torus
23
+
24
+
25
class NoiseTransform(BaseTransform):
    """Transform that samples a diffusion time and perturbs the ligand pose accordingly.

    The perturbation is a translation, a rotation and (unless `no_torsion`) torsion
    updates, with standard deviations given by `t_to_sigma`. In the default mode the
    corresponding scores are stored on the graph; in `time_independent` mode a binary
    label (RMSD to the original pose below `rmsd_cutoff`) is stored instead.
    """

    def __init__(self, t_to_sigma, no_torsion, all_atom, alpha=1, beta=1,
                 include_miscellaneous_atoms=False, crop_beyond_cutoff=None, time_independent=False, rmsd_cutoff=0,
                 minimum_t=0, sampling_mixing_coeff=0):
        # Noise schedule and what gets perturbed.
        self.t_to_sigma = t_to_sigma
        self.no_torsion = no_torsion
        self.all_atom = all_atom
        self.include_miscellaneous_atoms = include_miscellaneous_atoms
        # Time sampling: t ~ Beta(alpha, beta), rescaled with minimum_t / mixing_coeff.
        self.alpha = alpha
        self.beta = beta
        self.minimum_t = minimum_t
        self.mixing_coeff = sampling_mixing_coeff
        # Post-processing options.
        self.crop_beyond_cutoff = crop_beyond_cutoff
        self.rmsd_cutoff = rmsd_cutoff
        self.time_independent = time_independent

    def __call__(self, data):
        """Sample a time and return `data` with noise applied."""
        sampled_times = self.get_time()
        return self.apply_noise(data, *sampled_times)

    def get_time(self):
        """Sample the diffusion time; returns (t_tr, t_rot, t_tor, t), all equal."""
        if self.time_independent:
            t = np.random.beta(self.alpha, self.beta)
            return t, t, t, t

        if self.mixing_coeff == 0:
            # Beta sample rescaled to [minimum_t, 1].
            t = np.random.beta(self.alpha, self.beta)
            t = self.minimum_t + t * (1 - self.minimum_t)
        else:
            # Mixture: with probability mixing_coeff a time in [0, minimum_t],
            # otherwise a time in [minimum_t, 1]. Both candidates are always drawn.
            choice = np.random.binomial(1, self.mixing_coeff)
            low_t = np.random.beta(self.alpha, self.beta)
            low_t = low_t * self.minimum_t
            high_t = np.random.beta(self.alpha, self.beta)
            high_t = self.minimum_t + high_t * (1 - self.minimum_t)
            t = choice * low_t + (1 - choice) * high_t

        return t, t, t, t

    def apply_noise(self, data, t_tr, t_rot, t_tor, t, tr_update = None, rot_update=None, torsion_updates=None):
        """Perturb the ligand of `data` in place and attach the training targets.

        Updates that are passed in are used as given; missing ones are sampled from the
        sigmas returned by `t_to_sigma`. Returns `data`.
        """
        # A non-tensor `pos` is treated as a collection of candidate poses: pick one.
        if not torch.is_tensor(data['ligand'].pos):
            data['ligand'].pos = random.choice(data['ligand'].pos)

        tr_sigma, rot_sigma, tor_sigma = self.t_to_sigma(t_tr, t_rot, t_tor)

        if self.time_independent:
            # Keep the unperturbed complex to measure the RMSD afterwards.
            orig_complex_graph = copy.deepcopy(data)
            set_time(data, 0, 0, 0, 0, 1, self.all_atom, device=None, include_miscellaneous_atoms=self.include_miscellaneous_atoms)
        else:
            set_time(data, t, t_tr, t_rot, t_tor, 1, self.all_atom, device=None, include_miscellaneous_atoms=self.include_miscellaneous_atoms)

        # Sample whichever updates were not supplied (translation, rotation, torsions, in that order).
        if tr_update is None:
            tr_update = torch.normal(mean=0, std=tr_sigma, size=(1, 3))
        if rot_update is None:
            rot_update = so3.sample_vec(eps=rot_sigma)
        if torsion_updates is None:
            torsion_updates = np.random.normal(loc=0.0, scale=tor_sigma, size=data['ligand'].edge_mask.sum())
        if self.no_torsion:
            torsion_updates = None

        # Best effort: a failing conformer update is reported and the pose is left as it is.
        try:
            modify_conformer(data, tr_update, torch.from_numpy(rot_update).float(), torsion_updates)
        except Exception as e:
            print("failed modify conformer")
            print(e)

        if self.time_independent:
            orig_ligand = orig_complex_graph['ligand']
            center = orig_complex_graph.original_center.cpu().numpy()
            if self.no_torsion:
                orig_ligand.orig_pos = (orig_ligand.pos.cpu().numpy() + center)

            # RMSD over the atoms whose first feature is non-zero (named filterHs upstream).
            heavy_atoms = torch.not_equal(data['ligand'].x[:, 0], 0).cpu().numpy()
            if isinstance(orig_ligand.orig_pos, list):
                orig_ligand.orig_pos = orig_ligand.orig_pos[0]
            noised_pos = data['ligand'].pos.cpu().numpy()[heavy_atoms]
            reference_pos = orig_ligand.orig_pos[heavy_atoms] - center
            rmsd = np.sqrt(((noised_pos - reference_pos) ** 2).sum(axis=1).mean(axis=0))
            data.y = torch.tensor(rmsd < self.rmsd_cutoff).float().unsqueeze(0)
            data.atom_y = data.y
            return data

        # Score targets for the applied perturbation.
        data.tr_score = -tr_update / tr_sigma ** 2
        data.rot_score = torch.from_numpy(so3.score_vec(vec=rot_update, eps=rot_sigma)).float().unsqueeze(0)
        if self.no_torsion:
            data.tor_score = None
            data.tor_sigma_edge = None
        else:
            data.tor_score = torch.from_numpy(torus.score(torsion_updates, tor_sigma)).float()
            data.tor_sigma_edge = np.ones(data['ligand'].edge_mask.sum()) * tor_sigma

        if data['ligand'].pos.shape[0] == 1:
            # if the ligand is a single atom, the rotational score is always 0
            data.rot_score = data.rot_score * 0

        if self.crop_beyond_cutoff is not None:
            crop_beyond(data, tr_sigma * 3 + self.crop_beyond_cutoff, self.all_atom)
            # Set the time features again after cropping.
            set_time(data, t, t_tr, t_rot, t_tor, 1, self.all_atom, device=None, include_miscellaneous_atoms=self.include_miscellaneous_atoms)
        return data
116
+
117
+
118
class PDBBind(Dataset):
    """Dataset of protein-ligand complex graphs backed by an on-disk pickle cache.

    On construction the configuration is folded into a cache directory name.
    If that directory does not yet hold every expected chunk file, the
    complexes are (re)built in chunks of 1000 and pickled as
    ``heterographs{i}.pkl`` / ``rdkit_ligands{i}.pkl``.  All chunks are then
    loaded into memory and ``get`` serves deep copies of the cached graphs.
    """

    def __init__(self, root, transform=None, cache_path='data/cache', split_path='data/', limit_complexes=0, chain_cutoff=10,
                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, popsize=15, maxiter=15,
                 matching=True, keep_original=False, max_lig_size=None, remove_hs=False, num_conformers=1, all_atoms=False,
                 atom_radius=5, atom_max_neighbors=None, esm_embeddings_path=None, require_ligand=False,
                 include_miscellaneous_atoms=False,
                 protein_path_list=None, ligand_descriptions=None, keep_local_structures=False,
                 protein_file="protein_processed", ligand_file="ligand",
                 knn_only_graph=False, matching_tries=1, dataset='PDBBind'):
        """Store the configuration, build the cache if needed and load it.

        Notable arguments (as used by the code below):
            root: directory holding one sub-folder per complex.
            split_path: text file listing the complex names to use.
            limit_complexes: keep only the first N names; ``0``/``None`` means no limit.
            num_workers: values above 1 process complexes in a multiprocessing pool.
            protein_path_list, ligand_descriptions: when BOTH are given,
                ``inference_preprocessing`` is used instead of ``preprocessing``.
            require_ligand: when true, ``get`` attaches the RDKit ligand as ``mol``.
        """
        super(PDBBind, self).__init__(root, transform)
        self.pdbbind_dir = root
        self.include_miscellaneous_atoms = include_miscellaneous_atoms
        self.max_lig_size = max_lig_size
        self.split_path = split_path
        self.limit_complexes = limit_complexes
        self.chain_cutoff = chain_cutoff
        self.receptor_radius = receptor_radius
        self.num_workers = num_workers
        self.c_alpha_max_neighbors = c_alpha_max_neighbors
        self.remove_hs = remove_hs
        self.esm_embeddings_path = esm_embeddings_path
        self.use_old_wrong_embedding_order = False
        self.require_ligand = require_ligand
        self.protein_path_list = protein_path_list
        self.ligand_descriptions = ligand_descriptions
        self.keep_local_structures = keep_local_structures
        self.protein_file = protein_file
        self.fixed_knn_radius_graph = True
        self.knn_only_graph = knn_only_graph
        self.matching_tries = matching_tries
        self.ligand_file = ligand_file
        self.dataset = dataset
        assert knn_only_graph or (not all_atoms)
        self.all_atoms = all_atoms
        if matching or protein_path_list is not None and ligand_descriptions is not None:
            cache_path += '_torsion'
        if all_atoms:
            cache_path += '_allatoms'

        # The cache directory name encodes every option that changes the
        # preprocessed output, so different configurations never share a cache.
        split_stem = os.path.splitext(os.path.basename(self.split_path))[0]
        self.full_cache_path = os.path.join(cache_path, f'{dataset}3_limit{self.limit_complexes}'
                                                        f'_INDEX{split_stem}'
                                                        f'_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}'
                                                        f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
                                                        f'_chainCutoff{self.chain_cutoff if self.chain_cutoff is None else int(self.chain_cutoff)}'
                                            + ('' if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
                                            + ('' if not matching or num_conformers == 1 else f'_confs{num_conformers}')
                                            + ('' if self.esm_embeddings_path is None else '_esmEmbeddings')
                                            + '_full'
                                            + ('' if not keep_local_structures else '_keptLocalStruct')
                                            + ('' if protein_path_list is None or ligand_descriptions is None else str(binascii.crc32(''.join(ligand_descriptions + protein_path_list).encode())))
                                            + ('' if protein_file == "protein_processed" else '_' + protein_file)
                                            + ('' if not self.fixed_knn_radius_graph else ('_fixedKNN' if not self.knn_only_graph else '_fixedKNNonly'))
                                            + ('' if not self.include_miscellaneous_atoms else '_miscAtoms')
                                            + ('' if self.use_old_wrong_embedding_order else '_chainOrd')
                                            + ('' if self.matching_tries == 1 else f'_tries{matching_tries}'))
        self.popsize, self.maxiter = popsize, maxiter
        self.matching, self.keep_original = matching, keep_original
        self.num_conformers = num_conformers

        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
        if not self.check_all_complexes():
            os.makedirs(self.full_cache_path, exist_ok=True)
            if protein_path_list is None or ligand_descriptions is None:
                self.preprocessing()
            else:
                self.inference_preprocessing()

        self.complex_graphs, self.rdkit_ligands = self.collect_all_complexes()
        print_statistics(self.complex_graphs)
        # Record which complexes actually made it into the dataset.
        list_names = [graph['name'] for graph in self.complex_graphs]
        with open(os.path.join(self.full_cache_path, f'pdbbind_{split_stem[:3]}_names.txt'), 'w') as f:
            f.write('\n'.join(list_names))

    def len(self):
        """Return the number of cached complex graphs."""
        return len(self.complex_graphs)

    def get(self, idx):
        """Return a deep copy of complex ``idx`` with bookkeeping attributes removed."""
        complex_graph = copy.deepcopy(self.complex_graphs[idx])
        if self.require_ligand:
            complex_graph.mol = RemoveAllHs(copy.deepcopy(self.rdkit_ligands[idx]))

        # These attributes are deleted from both the graph and its receptor store.
        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq', 'to_keep', 'chain_ids']:
            if hasattr(complex_graph, a):
                delattr(complex_graph, a)
            if hasattr(complex_graph['receptor'], a):
                delattr(complex_graph['receptor'], a)

        return complex_graph

    def _process_and_cache_chunk(self, i, jobs, total, desc):
        """Run ``get_complex`` over ``jobs`` and pickle the results as chunk ``i``.

        ``jobs`` is a list of ``(name, lm_embeddings, ligand, ligand_description)``
        tuples.  With ``num_workers > 1`` the jobs run in a multiprocessing
        pool, which is always terminated, even when a job raises.
        """
        complex_graphs, rdkit_ligands = [], []
        pool = Pool(self.num_workers, maxtasksperchild=1) if self.num_workers > 1 else None
        try:
            map_fn = pool.imap_unordered if pool is not None else map
            with tqdm(total=total, desc=desc) as pbar:
                for graphs, ligands in map_fn(self.get_complex, jobs):
                    complex_graphs.extend(graphs)
                    rdkit_ligands.extend(ligands)
                    pbar.update()
        finally:
            # Same clean-up Pool.__exit__ performs, but also on the error path.
            if pool is not None:
                pool.terminate()

        with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'wb') as f:
            pickle.dump((complex_graphs), f)
        with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
            pickle.dump((rdkit_ligands), f)

    def preprocessing(self):
        """Build and cache the graphs for every complex named in ``split_path``."""
        print(f'Processing complexes from [{self.split_path}] and saving it to [{self.full_cache_path}]')

        complex_names_all = read_strings_from_txt(self.split_path)
        if self.limit_complexes is not None and self.limit_complexes != 0:
            complex_names_all = complex_names_all[:self.limit_complexes]
        print(f'Loading {len(complex_names_all)} complexes.')

        if self.esm_embeddings_path is not None:
            # NOTE(review): torch.load unpickles the file; only point this at trusted embeddings.
            id_to_embeddings = torch.load(self.esm_embeddings_path)
            wanted_names = set(complex_names_all)  # O(1) membership instead of scanning the list per key
            chain_embeddings_dictlist = defaultdict(list)
            chain_indices_dictlist = defaultdict(list)
            for key, embedding in id_to_embeddings.items():
                # Keys look like '<complex name>_chain_<index>'.
                key_name = key.split('_chain_')[0]
                if key_name in wanted_names:
                    chain_embeddings_dictlist[key_name].append(embedding)
                    chain_indices_dictlist[key_name].append(int(key.split('_chain_')[1]))
            lm_embeddings_chains_all = []
            for name in complex_names_all:
                complex_chains_embeddings = chain_embeddings_dictlist[name]
                complex_chains_indices = chain_indices_dictlist[name]
                # Put the chains of each complex in ascending chain-index order.
                chain_reorder_idx = np.argsort(complex_chains_indices)
                reordered_chains = [complex_chains_embeddings[i] for i in chain_reorder_idx]
                lm_embeddings_chains_all.append(reordered_chains)
        else:
            lm_embeddings_chains_all = [None] * len(complex_names_all)

        # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
        num_chunks = len(complex_names_all) // 1000 + 1
        list_indices = list(range(num_chunks))
        random.shuffle(list_indices)
        for i in list_indices:
            if os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
                continue
            complex_names = complex_names_all[1000*i:1000*(i+1)]
            lm_embeddings_chains = lm_embeddings_chains_all[1000*i:1000*(i+1)]
            no_ligands = [None] * len(complex_names)
            jobs = list(zip(complex_names, lm_embeddings_chains, no_ligands, no_ligands))
            self._process_and_cache_chunk(i, jobs, total=len(complex_names),
                                          desc=f'loading complexes {i}/{num_chunks}')

    def inference_preprocessing(self):
        """Build and cache graphs for explicit protein paths and ligand descriptions."""
        ligands_list = []
        print('Reading molecules and generating local structures with RDKit')
        for ligand_description in tqdm(self.ligand_descriptions):
            mol = MolFromSmiles(ligand_description)  # check if it is a smiles or a path
            if mol is not None:
                mol = AddHs(mol)
                generate_conformer(mol)
                ligands_list.append(mol)
            else:
                mol = read_molecule(ligand_description, remove_hs=False, sanitize=True)
                if not self.keep_local_structures:
                    mol.RemoveAllConformers()
                    mol = AddHs(mol)
                    generate_conformer(mol)
                ligands_list.append(mol)

        if self.esm_embeddings_path is not None:
            print('Reading language model embeddings.')
            lm_embeddings_chains_all = []
            if not os.path.exists(self.esm_embeddings_path): raise Exception('ESM embeddings path does not exist: ',self.esm_embeddings_path)
            for protein_path in self.protein_path_list:
                # One embedding file per chain, matched by the protein file's basename prefix.
                embeddings_paths = sorted(glob.glob(os.path.join(self.esm_embeddings_path, os.path.basename(protein_path)) + '*'))
                lm_embeddings_chains = []
                for embeddings_path in embeddings_paths:
                    lm_embeddings_chains.append(torch.load(embeddings_path)['representations'][33])
                lm_embeddings_chains_all.append(lm_embeddings_chains)
        else:
            lm_embeddings_chains_all = [None] * len(self.protein_path_list)

        print('Generating graphs for ligands and proteins')
        # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
        num_chunks = len(self.protein_path_list) // 1000 + 1
        list_indices = list(range(num_chunks))
        random.shuffle(list_indices)
        for i in list_indices:
            if os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
                continue
            protein_paths_chunk = self.protein_path_list[1000*i:1000*(i+1)]
            ligand_description_chunk = self.ligand_descriptions[1000*i:1000*(i+1)]
            ligands_chunk = ligands_list[1000 * i:1000 * (i + 1)]
            lm_embeddings_chains = lm_embeddings_chains_all[1000*i:1000*(i+1)]
            jobs = list(zip(protein_paths_chunk, lm_embeddings_chains, ligands_chunk, ligand_description_chunk))
            # The label reports the total number of chunks, not a count derived from this chunk's size.
            self._process_and_cache_chunk(i, jobs, total=len(protein_paths_chunk),
                                          desc=f'loading complexes {i}/{num_chunks}')

    def check_all_complexes(self):
        """Return True when the cache already holds every expected graph file."""
        if os.path.exists(os.path.join(self.full_cache_path, "heterographs.pkl")):
            return True

        complex_names_all = read_strings_from_txt(self.split_path)
        if self.limit_complexes is not None and self.limit_complexes != 0:
            complex_names_all = complex_names_all[:self.limit_complexes]
        for i in range(len(complex_names_all) // 1000 + 1):
            if not os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
                return False
        return True

    def collect_all_complexes(self):
        """Load every cached graph (and ligand) and return ``(graphs, ligands)``.

        A single ``heterographs.pkl`` takes precedence over the chunked files;
        in that case the ligands are only loaded when ``require_ligand`` is set,
        otherwise ``None`` is returned in their place.
        """
        print('Collecting all complexes from cache', self.full_cache_path)
        if os.path.exists(os.path.join(self.full_cache_path, "heterographs.pkl")):
            with open(os.path.join(self.full_cache_path, "heterographs.pkl"), 'rb') as f:
                complex_graphs = pickle.load(f)
            if self.require_ligand:
                with open(os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), 'rb') as f:
                    rdkit_ligands = pickle.load(f)
            else:
                rdkit_ligands = None
            return complex_graphs, rdkit_ligands

        complex_names_all = read_strings_from_txt(self.split_path)
        if self.limit_complexes is not None and self.limit_complexes != 0:
            complex_names_all = complex_names_all[:self.limit_complexes]
        complex_graphs_all = []
        for i in range(len(complex_names_all) // 1000 + 1):
            with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'rb') as f:
                print(i)
                l = pickle.load(f)
                complex_graphs_all.extend(l)

        rdkit_ligands_all = []
        for i in range(len(complex_names_all) // 1000 + 1):
            with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'rb') as f:
                l = pickle.load(f)
                rdkit_ligands_all.extend(l)

        return complex_graphs_all, rdkit_ligands_all

    def get_complex(self, par):
        """Build the graph for one complex.

        ``par`` is ``(name, lm_embedding_chains, ligand, ligand_description)``.
        Returns ``([complex_graph], [lig])`` on success and ``([], [])`` when the
        complex folder is missing, the ligand is too large, or graph
        construction raises.
        """
        name, lm_embedding_chains, ligand, ligand_description = par
        if not os.path.exists(os.path.join(self.pdbbind_dir, name)) and ligand is None:
            print("Folder not found", name)
            return [], []

        try:

            lig = read_mol(self.pdbbind_dir, name, suffix=self.ligand_file, remove_hs=False)
            if self.max_lig_size is not None and lig.GetNumHeavyAtoms() > self.max_lig_size:
                print(f'Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data.')
                return [], []

            complex_graph = HeteroData()
            complex_graph['name'] = name
            get_lig_graph_with_matching(lig, complex_graph, self.popsize, self.maxiter, self.matching, self.keep_original,
                                        self.num_conformers, remove_hs=self.remove_hs, tries=self.matching_tries)

            moad_extract_receptor_structure(path=os.path.join(self.pdbbind_dir, name, f'{name}_{self.protein_file}.pdb'),
                                            complex_graph=complex_graph,
                                            neighbor_cutoff=self.receptor_radius,
                                            max_neighbors=self.c_alpha_max_neighbors,
                                            lm_embeddings=lm_embedding_chains,
                                            knn_only_graph=self.knn_only_graph,
                                            all_atoms=self.all_atoms,
                                            atom_cutoff=self.atom_radius,
                                            atom_max_neighbors=self.atom_max_neighbors)

        except Exception as e:
            # Best effort: one broken complex must not abort the whole preprocessing run.
            print(f'Skipping {name} because of the error:')
            print(e)
            return [], []

        if self.dataset == 'posebusters':
            # Collect the heavy-atom coordinates of every pose in '<name>_ligands.sdf'.
            other_positions = []
            all_mol_file = os.path.join(self.pdbbind_dir, name, f'{name}_ligands.sdf')
            supplier = Chem.SDMolSupplier(all_mol_file, sanitize=False, removeHs=False)
            for mol in supplier:
                Chem.SanitizeMol(mol)
                all_mol = RemoveAllHs(mol)
                for conf in all_mol.GetConformers():
                    other_positions.append(conf.GetPositions())

            print(f'Found {len(other_positions)} alternative poses for {name}')
            complex_graph['ligand'].orig_pos = np.asarray(other_positions)

        # Re-centre everything on the mean receptor position; the offset is kept
        # on the graph as ``original_center``.
        protein_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
        complex_graph['receptor'].pos -= protein_center
        if self.all_atoms:
            complex_graph['atom'].pos -= protein_center

        if (not self.matching) or self.num_conformers == 1:
            complex_graph['ligand'].pos -= protein_center
        else:
            # Otherwise ``pos`` is iterated and each entry is shifted in place.
            for p in complex_graph['ligand'].pos:
                p -= protein_center

        complex_graph.original_center = protein_center
        complex_graph['receptor_name'] = name
        return [complex_graph], [lig]
419
+
420
+
421
def print_statistics(complex_graphs):
    """Print mean, std and max of a few geometric statistics over all complexes.

    For every graph this collects the largest receptor-position norm, the
    largest ligand distance from the ligand centroid, the norm of the ligand
    centroid, and (when present) the ``rmsd_matching`` / ``random_coords``
    entries; missing entries are recorded as ``0`` and ``-1`` respectively.
    """
    labels = ['radius protein', 'radius molecule', 'distance protein-mol', 'rmsd matching', 'random coordinates', 'random rmsd matching']
    protein_radii, ligand_radii, center_offsets = [], [], []
    matching_rmsds, random_flags, random_rmsds = [], [], []

    for graph in complex_graphs:
        ligand_pos = graph['ligand'].pos
        if not torch.is_tensor(ligand_pos):
            # A non-tensor ``pos`` is indexed and its first entry is used.
            ligand_pos = ligand_pos[0]
        ligand_center = torch.mean(ligand_pos, dim=0)

        protein_radii.append(torch.max(torch.linalg.vector_norm(graph['receptor'].pos, dim=1)))
        ligand_radii.append(torch.max(
            torch.linalg.vector_norm(ligand_pos - ligand_center.unsqueeze(0), dim=1)))
        center_offsets.append(torch.linalg.vector_norm(ligand_center))

        has_rmsd = "rmsd_matching" in graph
        has_random = "random_coords" in graph
        matching_rmsds.append(graph.rmsd_matching if has_rmsd else 0)
        random_flags.append(int(graph.random_coords) if has_random else -1)
        if has_random and graph.random_coords and has_rmsd:
            random_rmsds.append(graph.rmsd_matching)

    if not random_rmsds:
        # Placeholder so the summary below always has a value to reduce.
        random_rmsds.append(-1)

    print('Number of complexes: ', len(complex_graphs))
    columns = (protein_radii, ligand_radii, center_offsets, matching_rmsds, random_flags, random_rmsds)
    for label, values in zip(labels, columns):
        array = np.asarray(values)
        print(f"{label}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}")

    return
453
+
454
+
455
def read_mol(pdbbind_dir, name, suffix='ligand', remove_hs=False):
    """Read ``<pdbbind_dir>/<name>/<name>_<suffix>.sdf``, falling back to ``.mol2``.

    The ``.mol2`` file is only tried when reading the ``.sdf`` file yields
    ``None``; the result of that second attempt is returned as is.
    """
    stem = os.path.join(pdbbind_dir, name, f'{name}_{suffix}')
    from_sdf = read_molecule(stem + '.sdf', remove_hs=remove_hs, sanitize=True)
    if from_sdf is not None:
        return from_sdf
    # read mol2 file if sdf file cannot be sanitized
    return read_molecule(stem + '.mol2', remove_hs=remove_hs, sanitize=True)
460
+
461
+
462
def read_mols(pdbbind_dir, name, remove_hs=False):
    """Read every ligand ``.sdf`` file in ``<pdbbind_dir>/<name>``.

    Files whose name contains ``'rdkit'`` are ignored.  When an ``.sdf`` file
    cannot be read and a ``.mol2`` file with the same stem exists, that file is
    tried instead.  Only molecules that were read successfully are returned.
    """
    folder = os.path.join(pdbbind_dir, name)
    molecules = []
    for file in os.listdir(folder):
        if not file.endswith(".sdf") or 'rdkit' in file:
            continue
        lig = read_molecule(os.path.join(folder, file), remove_hs=remove_hs, sanitize=True)
        if lig is None:
            # read mol2 file if sdf file cannot be sanitized
            mol2_path = os.path.join(folder, file[:-4] + ".mol2")
            if os.path.exists(mol2_path):
                print('Using the .sdf file failed. We found a .mol2 file instead and are trying to use that.')
                lig = read_molecule(mol2_path, remove_hs=remove_hs, sanitize=True)
        if lig is not None:
            molecules.append(lig)
    return molecules
datasets/process_mols.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import warnings
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import torch
7
+ from Bio.PDB import PDBParser
8
+ from rdkit import Chem
9
+ from rdkit.Chem.rdchem import BondType as BT
10
+ from rdkit.Chem import AllChem, GetPeriodicTable, RemoveHs
11
+ from rdkit.Geometry import Point3D
12
+ from torch import cdist
13
+ from torch_cluster import knn_graph
14
+ import prody as pr
15
+
16
+ import torch.nn.functional as F
17
+
18
+ from datasets.conformer_matching import get_torsion_angles, optimize_rotatable_bonds
19
+ from datasets.constants import aa_short2long, atom_order, three_to_one
20
+ from datasets.parse_chi import get_chi_angles, get_coords, aa_idx2aa_short, get_onehot_sequence
21
+ from utils.torsion import get_transformation_mask
22
+ from utils.logging_utils import get_logger
23
+
24
logger = get_logger()

periodic_table = GetPeriodicTable()

# Vocabulary of every categorical feature. A value's feature index is its
# position in the corresponding list; lists ending in 'misc' use that last slot
# for values that are not listed (see safe_index).
allowable_features = {
    'possible_atomic_num_list': list(range(1, 119)) + ['misc'],
    'possible_chirality_list': [
        'CHI_UNSPECIFIED',
        'CHI_TETRAHEDRAL_CW',
        'CHI_TETRAHEDRAL_CCW',
        'CHI_OTHER'
    ],
    'possible_degree_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 'misc'],
    'possible_numring_list': [0, 1, 2, 3, 4, 5, 6, 'misc'],
    'possible_implicit_valence_list': [0, 1, 2, 3, 4, 5, 6, 'misc'],
    'possible_formal_charge_list': [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 'misc'],
    'possible_numH_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 'misc'],
    'possible_number_radical_e_list': [0, 1, 2, 3, 4, 'misc'],
    'possible_hybridization_list': [
        'SP', 'SP2', 'SP3', 'SP3D', 'SP3D2', 'misc'
    ],
    'possible_is_aromatic_list': [False, True],
    'possible_is_in_ring3_list': [False, True],
    'possible_is_in_ring4_list': [False, True],
    'possible_is_in_ring5_list': [False, True],
    'possible_is_in_ring6_list': [False, True],
    'possible_is_in_ring7_list': [False, True],
    'possible_is_in_ring8_list': [False, True],
    'possible_amino_acids': ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET',
                             'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'HIP', 'HIE', 'TPO', 'HID', 'LEV', 'MEU',
                             'PTR', 'GLV', 'CYT', 'SEP', 'HIZ', 'CYM', 'GLM', 'ASQ', 'TYS', 'CYX', 'GLZ', 'misc'],
    'possible_atom_type_2': ['C*', 'CA', 'CB', 'CD', 'CE', 'CG', 'CH', 'CZ', 'N*', 'ND', 'NE', 'NH', 'NZ', 'O*', 'OD',
                             'OE', 'OG', 'OH', 'OX', 'S*', 'SD', 'SG', 'misc'],
    'possible_atom_type_3': ['C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3', 'CG', 'CG1', 'CG2', 'CH2',
                             'CZ', 'CZ2', 'CZ3', 'N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ', 'O', 'OD1',
                             'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH', 'OXT', 'SD', 'SG', 'misc'],
}

# RDKit bond type -> integer class used for the one-hot bond features.
bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}

# Each *_feature_dims value is (vocabulary size per categorical feature, number
# of scalar features). The key order below is the column order of the features.
lig_feature_dims = ([len(allowable_features[key]) for key in (
    'possible_atomic_num_list',
    'possible_chirality_list',
    'possible_degree_list',
    'possible_formal_charge_list',
    'possible_implicit_valence_list',
    'possible_numH_list',
    'possible_number_radical_e_list',
    'possible_hybridization_list',
    'possible_is_aromatic_list',
    'possible_numring_list',
    'possible_is_in_ring3_list',
    'possible_is_in_ring4_list',
    'possible_is_in_ring5_list',
    'possible_is_in_ring6_list',
    'possible_is_in_ring7_list',
    'possible_is_in_ring8_list',
)], 0)  # number of scalar features

rec_atom_feature_dims = ([len(allowable_features[key]) for key in (
    'possible_amino_acids',
    'possible_atomic_num_list',
    'possible_atom_type_2',
    'possible_atom_type_3',
)], 0)

rec_residue_feature_dims = ([len(allowable_features[key]) for key in (
    'possible_amino_acids',
)], 0)
91
+
92
+
93
def lig_atom_featurizer(mol):
    """Encode every atom of ``mol`` as a row of categorical feature indices.

    Each row holds 16 indices into the ``allowable_features`` vocabularies, in
    this order: atomic number, chirality, degree, formal charge, implicit
    valence, total H count, radical electrons, hybridization, aromaticity,
    number of rings, and membership in a ring of size 3 through 8.

    Returns:
        ``torch.tensor`` built from the list of per-atom rows.
    """
    ringinfo = mol.GetRingInfo()
    chirality_options = allowable_features['possible_chirality_list']
    atom_features_list = []
    for idx, atom in enumerate(mol.GetAtoms()):
        chiral_tag = str(atom.GetChiralTag())
        # Any tag without its own slot falls back to 'CHI_OTHER', so a tag
        # missing from the table no longer raises ValueError from ``.index``.
        if chiral_tag not in chirality_options:
            chiral_tag = 'CHI_OTHER'

        atom_features_list.append([
            safe_index(allowable_features['possible_atomic_num_list'], atom.GetAtomicNum()),
            chirality_options.index(chiral_tag),
            safe_index(allowable_features['possible_degree_list'], atom.GetTotalDegree()),
            safe_index(allowable_features['possible_formal_charge_list'], atom.GetFormalCharge()),
            safe_index(allowable_features['possible_implicit_valence_list'], atom.GetImplicitValence()),
            safe_index(allowable_features['possible_numH_list'], atom.GetTotalNumHs()),
            safe_index(allowable_features['possible_number_radical_e_list'], atom.GetNumRadicalElectrons()),
            safe_index(allowable_features['possible_hybridization_list'], str(atom.GetHybridization())),
            allowable_features['possible_is_aromatic_list'].index(atom.GetIsAromatic()),
            safe_index(allowable_features['possible_numring_list'], ringinfo.NumAtomRings(idx)),
            allowable_features['possible_is_in_ring3_list'].index(ringinfo.IsAtomInRingOfSize(idx, 3)),
            allowable_features['possible_is_in_ring4_list'].index(ringinfo.IsAtomInRingOfSize(idx, 4)),
            allowable_features['possible_is_in_ring5_list'].index(ringinfo.IsAtomInRingOfSize(idx, 5)),
            allowable_features['possible_is_in_ring6_list'].index(ringinfo.IsAtomInRingOfSize(idx, 6)),
            allowable_features['possible_is_in_ring7_list'].index(ringinfo.IsAtomInRingOfSize(idx, 7)),
            allowable_features['possible_is_in_ring8_list'].index(ringinfo.IsAtomInRingOfSize(idx, 8)),
        ])
    return torch.tensor(atom_features_list)
121
+
122
+
123
def safe_index(l, e):
    """Return the index of element ``e`` in list ``l``.

    If ``e`` is not present, return the last index (``len(l) - 1``), which is
    the 'misc' slot for the feature vocabularies in this module.
    """
    try:
        return l.index(e)
    except ValueError:
        # list.index signals "not found" with ValueError; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return len(l) - 1
129
+
130
+
131
def moad_extract_receptor_structure(path, complex_graph, neighbor_cutoff=20, max_neighbors=None, sequences_to_embeddings=None,
                                    knn_only_graph=False, lm_embeddings=None, all_atoms=False, atom_cutoff=None, atom_max_neighbors=None):
    """Parse the PDB file at ``path`` and add the receptor to ``complex_graph``.

    Stores the per-chain sequences and a per-residue chain index on
    ``complex_graph['receptor']``, then hands over to
    ``new_extract_receptor_structure`` for the features and edges.  When
    ``sequences_to_embeddings`` is given, the embeddings are looked up per
    chain sequence and replace the ``lm_embeddings`` argument.
    """
    # load the entire pdb file
    structure = pr.parsePDB(str(path))
    calphas = structure.ca
    seq = calphas.getSequence()
    coords = get_coords(structure)
    one_hot = get_onehot_sequence(seq)

    # A chain is identified by its segment name followed by its chain id.
    chain_labels = calphas.getChids()
    segment_labels = calphas.getSegnames()
    res_chain_ids = np.asarray([seg + chain for seg, chain in zip(segment_labels, chain_labels)])

    chain_ids = np.zeros(len(one_hot))
    sequences = []
    if sequences_to_embeddings is not None:
        lm_embeddings = []

    for chain_number, chain_label in enumerate(np.unique(res_chain_ids)):
        in_chain = res_chain_ids == chain_label
        chain_ids[in_chain] = chain_number

        # Decode this chain's one-hot rows back into a one-letter sequence.
        chain_seq = ''.join(aa_idx2aa_short[aa_idx] for aa_idx in np.argmax(one_hot[in_chain], axis=1))
        sequences.append(chain_seq)
        if sequences_to_embeddings is not None:
            lm_embeddings.append(sequences_to_embeddings[chain_seq])

    receptor = complex_graph['receptor']
    receptor.sequence = sequences
    receptor.chain_ids = torch.from_numpy(np.asarray(chain_ids)).long()

    new_extract_receptor_structure(seq, coords, complex_graph, neighbor_cutoff=neighbor_cutoff, max_neighbors=max_neighbors,
                                   lm_embeddings=lm_embeddings, knn_only_graph=knn_only_graph, all_atoms=all_atoms,
                                   atom_cutoff=atom_cutoff, atom_max_neighbors=atom_max_neighbors)
162
+
163
+
164
def _cutoff_edge_index(distances, cutoff, max_neighbors, no_neighbor_message):
    """Build a ``[targets, sources]`` edge tensor from a pairwise distance matrix.

    Every node is connected to all other nodes closer than ``cutoff``.  If that
    gives more than ``max_neighbors`` (1000 when falsy) neighbours, the nearest
    ``max_neighbors`` are used instead; if it gives none, the single nearest
    other node is used and ``no_neighbor_message`` is printed.
    """
    neighbor_cap = max_neighbors if max_neighbors else 1000
    sources, targets = [], []
    for i in range(len(distances)):
        row = distances[i, :]
        nbrs = list(np.where(row < cutoff)[0])
        nbrs.remove(i)
        if len(nbrs) > neighbor_cap:
            # position 0 of the sorted row is skipped (assumed to be i itself)
            nbrs = list(np.argsort(row))[1: neighbor_cap + 1]
        if len(nbrs) == 0:
            nbrs = list(np.argsort(row))[1:2]  # choose second because first is i itself
            print(no_neighbor_message)
        assert i not in nbrs
        sources.extend([i] * len(nbrs))
        targets.extend(nbrs)
    return torch.from_numpy(np.asarray([targets, sources]))


def new_extract_receptor_structure(seq, all_coords, complex_graph, neighbor_cutoff=20, max_neighbors=None, lm_embeddings=None,
                                   knn_only_graph=False, all_atoms=False, atom_cutoff=None, atom_max_neighbors=None):
    """Fill ``complex_graph`` with receptor (and optionally atom) nodes and edges.

    The receptor store receives ``x`` (amino-acid index, optionally followed by
    the concatenated ``lm_embeddings``), ``pos`` (``all_coords[:, 1, :]``) and
    ``side_chain_vecs``; residues are connected either by a k-NN graph
    (``knn_only_graph``) or by the distance cutoff.  With ``all_atoms`` an
    ``'atom'`` store plus atom-atom and atom-residue edges is added as well.

    Raises:
        ValueError: if there are more than 3000 residue positions.
    """
    chi_angles, one_hot = get_chi_angles(all_coords, seq, return_onehot=True)
    # Offsets of atom slots 0 and 2 relative to slot 1 of every residue
    # (presumably N and C relative to C-alpha — TODO confirm against get_coords).
    n_rel_pos = all_coords[:, 0, :] - all_coords[:, 1, :]
    c_rel_pos = all_coords[:, 2, :] - all_coords[:, 1, :]
    side_chain_vecs = torch.from_numpy(np.concatenate([chi_angles / 360, n_rel_pos, c_rel_pos], axis=1))

    # Build the k-NN graph
    coords = torch.tensor(all_coords[:, 1, :], dtype=torch.float)
    if len(coords) > 3000:
        raise ValueError(f'The receptor is too large {len(coords)}')
    if knn_only_graph:
        edge_index = knn_graph(coords, k=max_neighbors if max_neighbors else 32)
    else:
        edge_index = _cutoff_edge_index(
            cdist(coords, coords), neighbor_cutoff, max_neighbors,
            f'The cutoff {neighbor_cutoff} was too small for one atom such that it had no neighbors. '
            f'So we connected it to the closest other atom')

    # One-letter codes without a long name are treated as 'misc'.
    res_names_list = [aa_short2long.get(aa, 'misc') for aa in seq]
    amino_acid_vocab = allowable_features['possible_amino_acids']
    node_feat = torch.tensor([[safe_index(amino_acid_vocab, res)] for res in res_names_list], dtype=torch.float32)

    receptor = complex_graph['receptor']
    if lm_embeddings is not None:
        lm_embeddings = torch.tensor(np.concatenate(lm_embeddings, axis=0))
        receptor.x = torch.cat([node_feat, lm_embeddings], axis=1)
    else:
        receptor.x = node_feat
    receptor.pos = coords
    receptor.side_chain_vecs = side_chain_vecs.float()
    complex_graph['receptor', 'rec_contact', 'receptor'].edge_index = edge_index

    if all_atoms:
        # Flatten to one row per atom slot and drop slots with NaN coordinates.
        flat_coords = all_coords.reshape(-1, 3)
        atom_coords = torch.from_numpy(flat_coords[~np.any(np.isnan(flat_coords), axis=1)]).float()

        if knn_only_graph:
            atoms_edge_index = knn_graph(atom_coords, k=atom_max_neighbors if atom_max_neighbors else 1000)
        else:
            atoms_edge_index = _cutoff_edge_index(
                cdist(atom_coords, atom_coords), atom_cutoff, atom_max_neighbors,
                f'The atom_cutoff {atom_cutoff} was too small for one atom such that it had no neighbors. '
                f'So we connected it to the closest other atom')

        feats = [get_moad_atom_feats(res, all_coords[i]) for i, res in enumerate(seq)]
        atom_feat = torch.from_numpy(np.concatenate(feats, axis=0)).float()
        # Residue index of every atom, in the same order as the atom features.
        c_alpha_idx = np.concatenate([np.full(len(f), float(i)) for i, f in enumerate(feats)])
        atom_res_edge_index = torch.from_numpy(np.stack([np.arange(len(atom_feat)), c_alpha_idx])).long()

        complex_graph['atom'].x = atom_feat
        complex_graph['atom'].pos = atom_coords
        assert len(complex_graph['atom'].x) == len(complex_graph['atom'].pos)
        complex_graph['atom', 'atom_contact', 'atom'].edge_index = atoms_edge_index
        complex_graph['atom', 'atom_rec_contact', 'receptor'].edge_index = atom_res_edge_index

    return
245
+
246
+
247
def get_moad_atom_feats(res, coords):
    """Return the categorical features of every resolved atom of one residue.

    Args:
        res: one-letter residue code; used as key into ``aa_short2long`` and
            ``atom_order``.
        coords: per-atom-slot coordinates of the residue; slots containing NaN
            are skipped.

    Returns:
        ``np.ndarray`` with one row per kept atom:
        ``[amino-acid index, atomic-number index, atom_type_2 index, atom_type_3 index]``.
    """
    # NOTE(review): both lookups run before the ``res == '-'`` check below, so
    # that branch is only reachable if '-' is a key of both tables — confirm.
    res_long = aa_short2long[res]
    res_order = atom_order[res]

    atomic_num_vocab = allowable_features['possible_atomic_num_list']
    atom_type_2_vocab = allowable_features['possible_atom_type_2']
    atom_type_3_vocab = allowable_features['possible_atom_type_3']
    amino_acid_vocab = allowable_features['possible_amino_acids']

    feats = []
    for i, c in enumerate(coords):
        if np.any(np.isnan(c)):
            continue
        if res == '-':
            atom_feats = [safe_index(amino_acid_vocab, 'misc'),
                          safe_index(atomic_num_vocab, 'misc'),
                          safe_index(atom_type_2_vocab, 'misc'),
                          safe_index(atom_type_3_vocab, 'misc')]
        else:
            atom_feats = [safe_index(amino_acid_vocab, res_long)]
            if i >= len(res_order):
                # Slot beyond the atoms listed for this residue type.
                atom_feats.extend([safe_index(atomic_num_vocab, 'misc'),
                                   safe_index(atom_type_2_vocab, 'misc'),
                                   safe_index(atom_type_3_vocab, 'misc')])
            else:
                atom_name = res_order[i]
                try:
                    # The element symbol is taken to be the first character of the atom name.
                    atomic_num = periodic_table.GetAtomicNumber(atom_name[:1])
                except Exception:
                    # Unknown element: -1 is not in the vocabulary, so it maps to 'misc'.
                    print("element", res_order[i][:1], 'not found')
                    atomic_num = -1

                atom_feats.extend([safe_index(atomic_num_vocab, atomic_num),
                                   safe_index(atom_type_2_vocab, (atom_name + '*')[:2]),
                                   safe_index(atom_type_3_vocab, atom_name)])
        feats.append(atom_feats)
    feats = np.asarray(feats)
    return feats
280
+
281
+
282
def get_lig_graph(mol, complex_graph):
    """Write ligand node features, bond edges and coordinates into ``complex_graph``.

    Every bond is stored as two directed edges (one per direction). Edge
    attributes are a one-hot encoding over ``len(bonds)`` classes; bonds of
    type ``BT.UNSPECIFIED`` use class 0.

    Args:
        mol: RDKit molecule to featurise.
        complex_graph: Graph container indexed by node / edge type; it is
            modified in place (presumably a torch_geometric ``HeteroData`` -
            confirm against the caller).

    Returns:
        None.
    """
    node_features = lig_atom_featurizer(mol)

    sources, targets, bond_classes = [], [], []
    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        bond_type = bond.GetBondType()
        if bond_type != BT.UNSPECIFIED:
            bond_class = bonds[bond_type]
        else:
            bond_class = 0
        # Add both directions so the bond graph is symmetric.
        sources.extend((begin_idx, end_idx))
        targets.extend((end_idx, begin_idx))
        bond_classes.extend((bond_class, bond_class))

    edge_index = torch.tensor([sources, targets], dtype=torch.long)
    edge_type = torch.tensor(bond_classes, dtype=torch.long)
    edge_attr = F.one_hot(edge_type, num_classes=len(bonds)).to(torch.float)

    complex_graph['ligand'].x = node_features
    complex_graph['ligand', 'lig_bond', 'ligand'].edge_index = edge_index
    complex_graph['ligand', 'lig_bond', 'ligand'].edge_attr = edge_attr

    # Positions are only available when the molecule carries a conformer.
    if mol.GetNumConformers() > 0:
        positions = mol.GetConformer().GetPositions()
        complex_graph['ligand'].pos = torch.from_numpy(positions).float()

    return
305
+
306
+
307
def generate_conformer(mol):
    """Embed a 3D conformer into ``mol`` in place using RDKit ETKDGv2.

    Up to three embedding attempts are made with the default parameters. If
    all of them fail, random starting coordinates are enabled, the molecule is
    embedded once more and conformer 0 is MMFF-optimised.

    Args:
        mol: RDKit molecule; modified in place.

    Returns:
        True when the random-coordinate fallback was needed, False when one of
        the regular attempts succeeded.
    """
    params = AllChem.ETKDGv2()

    for attempt in range(3):
        if attempt > 0:
            logger.debug(f'rdkit coords could not be generated. trying again {attempt}.')
        conf_id = AllChem.EmbedMolecule(mol, params)
        if conf_id != -1:
            # Regular embedding succeeded; no MMFF optimisation is run on this path.
            return False

    logger.info('rdkit coords could not be generated without using random coords. using random coords now.')
    params.useRandomCoords = True
    AllChem.EmbedMolecule(mol, params)
    AllChem.MMFFOptimizeMolecule(mol, confId=0)
    return True
324
+
325
+
326
+ def get_lig_graph_with_matching(mol_, complex_graph, popsize, maxiter, matching, keep_original, num_conformers, remove_hs, tries=10, skip_matching=False):
327
+ if matching:
328
+ mol_maybe_noh = copy.deepcopy(mol_)
329
+ if remove_hs:
330
+ mol_maybe_noh = RemoveHs(mol_maybe_noh, sanitize=True)
331
+ mol_maybe_noh = AllChem.RemoveAllHs(mol_maybe_noh)
332
+ if keep_original:
333
+ positions = []
334
+ for conf in mol_maybe_noh.GetConformers():
335
+ positions.append(conf.GetPositions())
336
+ complex_graph['ligand'].orig_pos = np.asarray(positions) if len(positions) > 1 else positions[0]
337
+
338
+ # rotatable_bonds = get_torsion_angles(mol_maybe_noh)
339
+ _tmp = copy.deepcopy(mol_)
340
+ if remove_hs:
341
+ _tmp = RemoveHs(_tmp, sanitize=True)
342
+ _tmp = AllChem.RemoveAllHs(_tmp)
343
+ rotatable_bonds = get_torsion_angles(_tmp)
344
+
345
+ for i in range(num_conformers):
346
+ mols, rmsds = [], []
347
+ for _ in range(tries):
348
+ mol_rdkit = copy.deepcopy(mol_)
349
+
350
+ mol_rdkit.RemoveAllConformers()
351
+ mol_rdkit = AllChem.AddHs(mol_rdkit)
352
+ generate_conformer(mol_rdkit)
353
+ if remove_hs:
354
+ mol_rdkit = RemoveHs(mol_rdkit, sanitize=True)
355
+ mol_rdkit = AllChem.RemoveAllHs(mol_rdkit)
356
+ mol = AllChem.RemoveAllHs(copy.deepcopy(mol_maybe_noh))
357
+ if rotatable_bonds and not skip_matching:
358
+ optimize_rotatable_bonds(mol_rdkit, mol, rotatable_bonds, popsize=popsize, maxiter=maxiter)
359
+ mol.AddConformer(mol_rdkit.GetConformer())
360
+ rms_list = []
361
+ AllChem.AlignMolConformers(mol, RMSlist=rms_list)
362
+ mol_rdkit.RemoveAllConformers()
363
+ mol_rdkit.AddConformer(mol.GetConformers()[1])
364
+ mols.append(mol_rdkit)
365
+ rmsds.append(rms_list[0])
366
+
367
+ # select molecule with lowest rmsd
368
+ #print("mean std min max", np.mean(rmsds), np.std(rmsds), np.min(rmsds), np.max(rmsds))
369
+ mol_rdkit = mols[np.argmin(rmsds)]
370
+ if i == 0:
371
+ complex_graph.rmsd_matching = min(rmsds)
372
+ get_lig_graph(mol_rdkit, complex_graph)
373
+ else:
374
+ if torch.is_tensor(complex_graph['ligand'].pos):
375
+ complex_graph['ligand'].pos = [complex_graph['ligand'].pos]
376
+ complex_graph['ligand'].pos.append(torch.from_numpy(mol_rdkit.GetConformer().GetPositions()).float())
377
+
378
+ else: # no matching
379
+ complex_graph.rmsd_matching = 0
380
+ if remove_hs: mol_ = RemoveHs(mol_)
381
+ get_lig_graph(mol_, complex_graph)
382
+
383
+ edge_mask, mask_rotate = get_transformation_mask(complex_graph)
384
+ complex_graph['ligand'].edge_mask = torch.tensor(edge_mask)
385
+ complex_graph['ligand'].mask_rotate = mask_rotate
386
+
387
+ return
388
+
389
+
390
def get_rec_misc_atom_feat(bio_atom=None, atom_name=None, element=None, get_misc_features=False):
    """Return the four categorical feature indices for one receptor atom.

    Args:
        bio_atom: Atom object exposing ``name``, ``element`` and
            ``get_parent().get_resname()`` (presumably a Biopython atom -
            confirm). May be omitted when both ``atom_name`` and ``element``
            are supplied; the residue feature then falls back to ``'misc'``.
        atom_name: Overrides ``bio_atom.name`` when given.
        element: Overrides ``bio_atom.element`` when given.
        get_misc_features: When true, ignore all other arguments and return
            the all-``'misc'`` feature vector.

    Returns:
        List of four indices: amino acid, atomic number, two-character atom
        type and full atom name.

    Raises:
        ValueError: If ``atom_name`` or ``element`` is missing and no
            ``bio_atom`` was given to read it from.
        AssertionError: If the resolved element is the empty string.
    """
    if get_misc_features:
        return [safe_index(allowable_features['possible_amino_acids'], 'misc'),
                safe_index(allowable_features['possible_atomic_num_list'], 'misc'),
                safe_index(allowable_features['possible_atom_type_2'], 'misc'),
                safe_index(allowable_features['possible_atom_type_3'], 'misc')]

    if atom_name is None:
        if bio_atom is None:
            raise ValueError('atom_name is required when bio_atom is not provided')
        atom_name = bio_atom.name
    if element is None:
        if bio_atom is None:
            raise ValueError('element is required when bio_atom is not provided')
        element = bio_atom.element

    # NOTE(review): 'CD' is remapped to carbon - presumably a delta carbon whose
    # name was parsed as the element rather than cadmium; confirm.
    if element == 'CD':
        element = 'C'
    assert not element == ''

    try:
        atomic_num = periodic_table.GetAtomicNumber(element.lower().capitalize())
    except Exception:
        # Unknown element symbol; narrowed from a bare ``except``.
        atomic_num = -1

    # Previously ``bio_atom`` was dereferenced unconditionally here, which
    # crashed whenever the caller relied on the atom_name/element overrides.
    if bio_atom is not None:
        res_name = bio_atom.get_parent().get_resname()
    else:
        res_name = 'misc'

    atom_feat = [safe_index(allowable_features['possible_amino_acids'], res_name),
                 safe_index(allowable_features['possible_atomic_num_list'], atomic_num),
                 # '*' pads one-character atom names to two characters.
                 safe_index(allowable_features['possible_atom_type_2'], (atom_name + '*')[:2]),
                 safe_index(allowable_features['possible_atom_type_3'], atom_name)]
    return atom_feat
417
+
418
+
419
def write_mol_with_coords(mol, new_coords, path):
    """Overwrite the conformer coordinates of ``mol`` and write it to an SDF file.

    Args:
        mol: RDKit molecule with at least one conformer; its first conformer
            is modified in place.
        new_coords: Array-like indexable by atom index, each entry unpacking
            into ``x, y, z``.
        path: Destination passed to ``Chem.SDWriter``.

    Returns:
        None.
    """
    # Convert once up front; the previous code re-converted the whole array on
    # every loop iteration.
    coords = np.asarray(new_coords, dtype=np.double)
    conf = mol.GetConformer()

    w = Chem.SDWriter(path)
    try:
        for i in range(mol.GetNumAtoms()):
            x, y, z = coords[i]
            conf.SetAtomPosition(i, Point3D(x, y, z))
        w.write(mol)
    finally:
        # Always release the file handle, even if a coordinate is malformed.
        w.close()
427
+
428
+
429
def create_mol_with_coords(mol, new_coords, path=None):
    """Overwrite the conformer coordinates of ``mol`` and optionally save it.

    Args:
        mol: RDKit molecule with at least one conformer; modified in place.
        new_coords: Indexable by atom index, each entry unpacking into three
            values convertible with ``float``.
        path: Optional SDF output path; nothing is written when falsy.

    Returns:
        The same ``mol`` object that was passed in, with updated coordinates.
    """
    conf = mol.GetConformer()
    for i in range(mol.GetNumAtoms()):
        x, y, z = new_coords[i]
        conf.SetAtomPosition(i, Point3D(float(x), float(y), float(z)))

    if path:
        w = Chem.SDWriter(path)
        try:
            w.write(mol)
        finally:
            # Close the writer even when writing fails so the handle is not leaked.
            w.close()
    return mol
439
+
440
+
441
+ def read_molecule(ligand_description, sanitize=False, calc_charges=False, remove_hs=False, remove_confs=False):
442
+ mol = None
443
+ # Check if ligand_description is a path to a file
444
+ if Path(ligand_description).is_absolute() or len(Path(ligand_description).parts) > 1:
445
+ path = Path(ligand_description)
446
+ if path.is_file():
447
+ match path.suffix:
448
+ case '.mol':
449
+ mol = Chem.MolFromMolFile(str(path), sanitize=False, removeHs=True)
450
+ case '.mol2':
451
+ mol = Chem.MolFromMol2File(str(path), sanitize=False, removeHs=False)
452
+ case '.sdf':
453
+ supplier = Chem.SDMolSupplier(str(path), sanitize=False, removeHs=False)
454
+ mol = supplier[0]
455
+ case '.pdbqt':
456
+ with open(path) as file:
457
+ pdbqt_data = file.readlines()
458
+ pdb_block = ''
459
+ for line in pdbqt_data:
460
+ pdb_block += '{}\n'.format(line[:66])
461
+ mol = Chem.MolFromPDBBlock(pdb_block, sanitize=False, removeHs=False)
462
+ case '.pdb':
463
+ mol = Chem.MolFromPDBFile(str(path), sanitize=False, removeHs=False)
464
+ case _:
465
+ logger.warning(f'Expect the format of the molecule file to be '
466
+ f'one of .mol2, .sdf, .pdbqt and .pdb, got {ligand_description}')
467
+ else:
468
+ raise FileNotFoundError(f'File {ligand_description} not found.')
469
+ else:
470
+ mol = Chem.MolFromSmiles(ligand_description)
471
+ # No need to remove conformers if the molecule is not read from a file
472
+ remove_confs = False
473
+
474
+ if mol is not None:
475
+ try:
476
+ if sanitize or calc_charges:
477
+ Chem.SanitizeMol(mol)
478
+
479
+ if calc_charges:
480
+ # Compute Gasteiger charges on the molecule.
481
+ try:
482
+ AllChem.ComputeGasteigerCharges(mol)
483
+ except:
484
+ warnings.warn('Unable to compute charges for the molecule.')
485
+
486
+ if remove_hs:
487
+ mol = Chem.RemoveHs(mol, sanitize=sanitize)
488
+
489
+ if remove_confs:
490
+ mol.RemoveAllConformers()
491
+
492
+ except Exception as e:
493
+ # Print stacktrace
494
+ import traceback
495
+ msg = traceback.format_exc()
496
+ logger.warning(f"Failed to process molecule: {ligand_description}\n{msg}")
497
+ return None
498
+
499
+ return mol
datasets/sidechain_esm_embeddings_to_pt.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import pickle
4
+ from argparse import ArgumentParser
5
+
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+
10
# Collect the per-sequence ESM embedding files that belong to the "useful"
# sequences and bundle them into a single .pt dictionary keyed by sequence id.
parser = ArgumentParser()
parser.add_argument('--esm_embeddings_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new', help='')
parser.add_argument('--output_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new.pt', help='')
args = parser.parse_args()

dic = {}

# read text file with all sequences
with open('data/pdb_2021aug02/sequences_to_id.fasta') as f:
    lines = f.readlines()

# read sequences
# NOTE(review): pickle.load can execute arbitrary code; only run this on a
# trusted, locally produced file.
with open('data/pdb_2021aug02/useful_sequences.pkl', 'rb') as f:
    sequences = pickle.load(f)

ids = set()
# A sequence's id is its (stringified) line index in the text file. Strip only
# the newline: slicing with [:-1] dropped a real residue from a final line
# that has no trailing newline.
dict_seq_id = {line.rstrip('\n'): str(idx) for idx, line in enumerate(lines)}

for i, seq in tqdm(enumerate(sequences)):
    ids.add(dict_seq_id[seq])
    # Only sequences with index 0..20000 are considered.
    if i == 20000: break

embedding_files = os.listdir(args.esm_embeddings_path)
print("total", len(ids), "out of", len(embedding_files))

# Embedding files are matched by the part of the filename before the first dot.
available = {filename.split('.')[0] for filename in embedding_files}
final = available.intersection(ids)

for idp in tqdm(final):
    # Keep the layer-33 representation from each embedding file.
    dic[idp] = torch.load(os.path.join(args.esm_embeddings_path, idp+'.pt'))['representations'][33]
torch.save(dic,args.output_path)
inference.py CHANGED
@@ -45,7 +45,6 @@ from utils.diffusion_utils import t_to_sigma as t_to_sigma_compl, get_t_schedule
45
  from utils.inference_utils import InferenceDataset
46
  from utils.sampling import randomize_position, sampling
47
  from utils.utils import get_model
48
- from utils.visualise import PDBFile
49
  from tqdm import tqdm
50
 
51
  configure_logger()
@@ -622,7 +621,7 @@ def extract_pockets(protein_path, ligand_residue=None, top_pockets=None):
622
  # Run fpocket
623
  distance = 2.5
624
  min_size = 30
625
- args = ['fpocket', '-d', '-f', tmp_protein_path, '-D', str(distance), '-i', str(min_size)]
626
  if ligand_residue is not None:
627
  args += ['-r', ligand_residue]
628
  print(args)
 
45
  from utils.inference_utils import InferenceDataset
46
  from utils.sampling import randomize_position, sampling
47
  from utils.utils import get_model
 
48
  from tqdm import tqdm
49
 
50
  configure_logger()
 
621
  # Run fpocket
622
  distance = 2.5
623
  min_size = 30
624
+ args = ['./fpocket', '-d', '-f', tmp_protein_path, '-D', str(distance), '-i', str(min_size)]
625
  if ligand_residue is not None:
626
  args += ['-r', ligand_residue]
627
  print(args)
requirements.txt CHANGED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # deep learning
2
+ --extra-index-url https://download.pytorch.org/whl/cu118
3
+ torch==2.4
4
+ lightning==2.4
5
+ torch-geometric
6
+ https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_cluster-1.6.3%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
7
+ https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_scatter-2.1.2%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
8
+ https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_sparse-0.6.18%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
9
+ https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_spline_conv-1.2.2%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
10
+ scipy
11
+ fair-esm
12
+ e3nn
13
+ scikit-learn
14
+
15
+ # configuration and utilities
16
+ hydra-core
17
+ pandas
18
+ pandarallel
19
+ panel
20
+ seaborn
21
+ apscheduler
22
+ tinydb
23
+ email-validator
24
+ gradio_rangeslider
25
+
26
+ # bio/cheminformatics
27
+ biopython
28
+ rdkit
29
+ openbabel-wheel
30
+ prody
resources/animations/example_3.gif DELETED
Binary file (493 kB)
 
resources/animations/example_6.gif DELETED
Binary file (750 kB)
 
resources/linker_size_distributions.png DELETED
Binary file (16.3 kB)
 
resources/moad_test_pdbs.txt DELETED
@@ -1,90 +0,0 @@
1
- 1j78
2
- 6c0b
3
- 1j6z
4
- 2uyi
5
- 6cfg
6
- 5ncg
7
- 1esv
8
- 3spf
9
- 5yee
10
- 1bxm
11
- 3zln
12
- 4b1w
13
- 2gm1
14
- 1yrs
15
- 3ukr
16
- 5xdv
17
- 1efi
18
- 1fd7
19
- 2x2r
20
- 2a5x
21
- 1lri
22
- 1rvx
23
- 3ken
24
- 6rvp
25
- 5ncp
26
- 3pa8
27
- 3eks
28
- 1nwk
29
- 5n69
30
- 3f7i
31
- 4b1x
32
- 7aki
33
- 5neg
34
- 2fl6
35
- 2wog
36
- 5ncf
37
- 3sjh
38
- 6qgk
39
- 3zjx
40
- 3gt9
41
- 2x7c
42
- 2uym
43
- 2fky
44
- 4dxd
45
- 3zcw
46
- 1dtl
47
- 2q2y
48
- 6vmz
49
- 2vl8
50
- 4b1v
51
- 4b1y
52
- 1sqk
53
- 1eei
54
- 2pg2
55
- 3cjo
56
- 3gta
57
- 6pb9
58
- 5zzb
59
- 4pa0
60
- 6hmy
61
- 2i3i
62
- 6rcj
63
- 2fme
64
- 4b1u
65
- 2x7d
66
- 3lkf
67
- 2fl2
68
- 2x7e
69
- 6kvp
70
- 2q0u
71
- 3f7h
72
- 6g6y
73
- 3mn5
74
- 1rdw
75
- 1mfi
76
- 5xdt
77
- 6hmw
78
- 1q0b
79
- 5ndu
80
- 6rcf
81
- 6p7r
82
- 5lzj
83
- 1rvt
84
- 1oke
85
- 3k3b
86
- 1lt6
87
- 5xdu
88
- 3l9h
89
- 1rv0
90
- 3eku
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
resources/moad_val_pdbs.txt DELETED
@@ -1,153 +0,0 @@
1
- 2e27
2
- 2ajy
3
- 2bmk
4
- 6npi
5
- 1yec
6
- 6lam
7
- 25c8
8
- 1xuo
9
- 2ykl
10
- 3f78
11
- 2ajx
12
- 6vh6
13
- 4mrh
14
- 2r2e
15
- 3fo1
16
- 4np2
17
- 3gmm
18
- 6lt6
19
- 1mh5
20
- 6npp
21
- 6v3j
22
- 2ajz
23
- 1d6v
24
- 1x9q
25
- 1ngp
26
- 1i7z
27
- 2hxw
28
- 2ica
29
- 1gaf
30
- 4mre
31
- 1xdg
32
- 1dl7
33
- 3vrj
34
- 2yip
35
- 2bjm
36
- 3vkx
37
- 6be3
38
- 2o5z
39
- 3m6f
40
- 2z92
41
- 4l4v
42
- 2hvk
43
- 1wc7
44
- 1c1e
45
- 2c1p
46
- 1usq
47
- 1aj7
48
- 6ooy
49
- 1y0l
50
- 1riv
51
- 6pvc
52
- 2yk1
53
- 1yee
54
- 4mrf
55
- 2ddq
56
- 4lcw
57
- 1flr
58
- 3t0w
59
- 3fo2
60
- 1qyg
61
- 1vpo
62
- 2ajv
63
- 1ub5
64
- 4k3h
65
- 3cfb
66
- 3t0x
67
- 2omn
68
- 5xqw
69
- 5ln4
70
- 1mrf
71
- 6npm
72
- 1t66
73
- 1rum
74
- 5u98
75
- 5zqk
76
- 2pye
77
- 1eap
78
- 1yei
79
- 1rua
80
- 1riu
81
- 1xdd
82
- 1oar
83
- 5j6h
84
- 6dzn
85
- 1jgl
86
- 1h8s
87
- 1lo0
88
- 1f3d
89
- 3fjg
90
- 1q0y
91
- 1kn2
92
- 2bfv
93
- 6pvd
94
- 3kdm
95
- 1f4x
96
- 1b09
97
- 3if1
98
- 1mjj
99
- 6itp
100
- 35c8
101
- 2jkl
102
- 1a6v
103
- 3vri
104
- 2jkj
105
- 3fo0
106
- 1a4k
107
- 1lo2
108
- 43ca
109
- 1cfv
110
- 1a0q
111
- 6x42
112
- 1rd4
113
- 2dwe
114
- 4np3
115
- 3upr
116
- 1yeg
117
- 1kn4
118
- 6lah
119
- 1um5
120
- 1mex
121
- 6itq
122
- 3bqm
123
- 4f8l
124
- 1yek
125
- 1mj7
126
- 1rul
127
- 1um4
128
- 1fl3
129
- 4mrg
130
- 4ia6
131
- 2cgr
132
- 2cju
133
- 1ghq
134
- 1c5c
135
- 6lb2
136
- 1jgu
137
- 1fe8
138
- 7jra
139
- 4f8n
140
- 6nux
141
- 1ynk
142
- 1bfv
143
- 2pcp
144
- 2z93
145
- 1wz1
146
- 2yio
147
- 1yef
148
- 3cfd
149
- 1lo3
150
- 1q72
151
- 2o7n
152
- 3fj7
153
- 3oaz
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
resources/wehi_pains.csv DELETED
@@ -1,480 +0,0 @@
1
- "c:1:c:c(:c:c:c:1-[#6;X4]-c:2:c:c:c(:c:c:2)-[#7&H2,$([#7;!H0]-[#6;X4]),$([#7](-[#6X4])-[#6X4])])-[#7&H2,$([#7;!H0]-[#6;X4]),$([#7](-[#6X4])-[#6X4])]","<regId=anil_di_alk_F(14)>"
2
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#1])-[#1])-[#1])-[#6]=[#7]-[#7]-[#1]","<regId=hzone_anil(14)>"
3
- "c1(nn(c([c;!H0,$(c-[#6;!H0])]1)-[#8]-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#1])-[#1])-[#1])-[#6;X4]","<regId=het_5_pyrazole_OH(14)>"
4
- "c:2(:c:1-[#16]-c:3:c(-[#7;!H0,$([#7]-[CH3]),$([#7]-[#6;!H0;!H1]-[#6;!H0])](-c:1:c(:c(:c:2-[#1])-[#1])-[#1])):[c;!H0,$(c~[#7](-[#1])-[#6;X4]),$(c~[#6]:[#6])](:[c;!H0,$(c~[#6]:[#6])]:[c;!H0,$(c-[#7](-[#1])-[#1]),$(c-[#8]-[#6;X4])]:c:3-[#1]))-[#1]","<regId=het_thio_666_A(13)>"
5
- "[#6]-2-[#6]-c:1:c(:c:c:c:c:1)-[#6](-c:3:c:c:c:c:c-2:3)=[#6]-[#6]","<regId=styrene_A(13)>"
6
- "[#16]-1-[#6](=[#7]-[#6]:[#6])-[#7;!H0,$([#7]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#8]),$([#7]-[#6]:[#6])]-[#6](=[#8])-[#6]-1=[#6](-[#1])-[$([#6]:[#6]:[#6]-[#17]),$([#6]:[!#6&!#1])]","<regId=ene_rhod_C(13)>"
7
- "[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6](-[#6]=[#6])-[#8]-1)-[#6](-[#1])-[#1]","<regId=dhp_amino_CN_A(13)>"
8
- "[#8]=[#16](=[#8])-[#6](-[#6]#[#7])=[#7]-[#7]-[#1]","<regId=cyano_imine_C(12)>"
9
- "c:1:c:c:c:c:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2","<regId=thio_urea_A(12)>"
10
- "c:1:c(:c:c:c:c:1)-[#7](-[#1])-c:2:c(:c(:c(:s:2)-[$([#6]=[#8]),$([#6]#[#7]),$([#6](-[#8]-[#1])=[#6])])-[#7])-[$([#6]#[#7]),$([#6](:[#7]):[#7])]","<regId=thiophene_amino_B(12)>"
11
- "[#6;X4]-1-[#6](=[#8])-[#7]-[#7]-[#6]-1=[#8]","<regId=keto_keto_beta_B(12)>"
12
- "c:1:c-3:c(:c:c:c:1)-[#6]:2:[#7]:[!#1]:[#6]:[#6]:[#6]:2-[#6]-3=[#8]","<regId=keto_phenone_A(11)>"
13
- "[#6]-1(-[#6](=[#6](-[#6]#[#7])-[#6](~[#8])~[#7]~[#6]-1~[#8])-[#6](-[#1])-[#1])=[#6](-[#1])-[#6]:[#6]","<regId=cyano_pyridone_C(11)>"
14
- "[#6]-1(=[#6](-!@[#6]=[#7])-[#16]-[#6](-[#7]-1)=[#8])-[$([F,Cl,Br,I]),$([#7+](:[#6]):[#6])]","<regId=thiaz_ene_C(11)>"
15
- "c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1]):[!#6&!#1]:[#6;!H0,$([#6]-[OH]),$([#6]-[#6;H2,H3])](:[#6]:2-[#6](-[#1])=[#7]-[#7](-[#1])-[$([#6]:1:[#7]:[#6]:[#6](-[#1]):[#16]:1),$([#6]:[#6](-[#1]):[#6]-[#1]),$([#6]:[#7]:[#6]:[#7]:[#6]:[#7]),$([#6]:[#7]:[#7]:[#7]:[#7])])","<regId=hzone_thiophene_A(11)>"
16
- "[!#1]:[!#1]-[#6;!H0,$([#6]-[#6]#[#7])]=[#6]-1-[#6]=,:[#6]-[#6](=[$([#8]),$([#7;!R])])-[#6]=,:[#6]-1","<regId=ene_quin_methide(10)>"
17
- "c:1:c:c-2:c(:c:c:1)-[#6]-[#6](-c:3:c(-[#16]-2):c(:c(-[#1]):[c;!H0,$(c-[#8]),$(c-[#16;X2]),$(c-[#6;X4]),$(c-[#7;H2,H3,$([#7!H0]-[#6;X4]),$([#7](-[#6;X4])-[#6;X4])])](:c:3-[#1]))-[#1])-[#7;H2,H3,$([#7;!H0]-[#6;X4]),$([#7](-[#6;X4])-[#6;X4])]","<regId=het_thio_676_A(10)>"
18
- "[#6]-1(=[#8])-[#6](=[#6](-[#1])-[$([#6]:1:[#6]:[#6]:[#6]:[#6]:[#6]:1),$([#6]:1:[#6]:[#6]:[#6]:[!#6&!#1]:1)])-[#7]=[#6](-[!#1]:[!#1]:[!#1])-[$([#16]),$([#7]-[!#1]:[!#1])]-1","<regId=ene_five_het_G(10)>"
19
- "[#7+](:[!#1]:[!#1]:[!#1])-[!#1]=[#8]","<regId=acyl_het_A(9)>"
20
- "[#6;X4]-[#7](-[#6;X4])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6]2=,:[#7][#6]:[#6]:[!#1]2)-[#1])-[#1]","<regId=anil_di_alk_G(9)>"
21
- "[#7;!H0,$([#7]-[#6;X4])]-1-[#6]=,:[#6](-[#6](=[#8])-[#6]:[#6]:[#6])-[#6](-[#6])-[#6](=[#6]-1-[#6](-[#1])(-[#1])-[#1])-[$([#6]=[#8]),$([#6]#[#7])]","<regId=dhp_keto_A(9)>"
22
- "c:1:c:c:c:c:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2","<regId=thio_urea_B(9)>"
23
- "c:1:3:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c:c:c:2)-[#1]):n:c(-[#1]):n:3-[#6]","<regId=anil_alk_bim(9)>"
24
- "c:1:c:c-2:c(:c:c:1)-[#7]=[#6]-[#6]-2=[#7;!R]","<regId=imine_imine_A(9)>"
25
- "c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-[#6](=[#8])-[#6]-,:2:[!#1]:[!#6&!#1]:[#6]:[#6]-,:2","<regId=thio_urea_C(9)>"
26
- "[#7;!R]=[#6]-2-[#6](=[#8])-c:1:c:c:c:c:c:1-[#16]-2","<regId=imine_one_fives_B(9)>"
27
- "[$([#7](-[#1])-[#1]),$([#8]-[#1])]-[#6]-2=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-c:1:c(:n(-[#6]):n:c:1)-[#8]-2","<regId=dhp_amino_CN_B(9)>"
28
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:n:c:1-[#1])-[#8]-c:2:c:c:c:c:c:2)-[#1])-[#1]","<regId=anil_OC_no_alk_A(8)>"
29
- "[#6](=[#8])-[#6]-1=[#6]-[#7]-c:2:c(-[#16]-1):c:c:c:c:2","<regId=het_thio_66_one(8)>"
30
- "c:1:c:c-2:c(:c:c:1)-[#6](-c:3:c(-[$([#16;X2]),$([#6;X4])]-2):c:c:[c;!H0,$(c-[#17]),$(c-[#6;X4])](:c:3))=[#6]-[#6]","<regId=styrene_B(8)>"
31
- "[#6](-[#1])(-[#1])-[#16;X2]-c:1:n:c(:c(:n:1-!@[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2)-[#1]","<regId=het_thio_5_A(8)>"
32
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6]-2=[#6](-[#1])-c:1:c(:c:c:c:c:1)-[#16;X2]-c:3:c-2:c:c:c:c:3","<regId=anil_di_alk_ene_A(8)>"
33
- "[#16]-1-[#6](=!@[#7;!H0,$([#7]-[#7](-[#1])-[#6]:[#6])])-[#7;!H0,$([#7]-[#6]:[#7]:[#6]:[#6]:[#16])]-[#6](=[#8])-[#6]-1=[#6](-[#1])-[#6]:[#6]-[$([#17]),$([#8]-[#6]-[#1])]","<regId=ene_rhod_D(8)>"
34
- "[#16]-1-[#6](=[#8])-[#7]-[#6](=[#16])-[#6]-1=[#6](-[#1])-[#6]:[#6]","<regId=ene_rhod_E(8)>"
35
- "c:1:c(:c:c:c:c:1)-[#6](-[#1])(-[#1])-[#7](-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#1])-[#1])-[#1]","<regId=anil_OH_alk_A(8)>"
36
- "n1(-[#6;X4])c(c(-[#1])c(c1-[#6]:[#6])-[#1])-[#6](-[#1])-[#1]","<regId=pyrrole_C(8)>"
37
- "c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-c:2:c:c:c:c:c:2","<regId=thio_urea_D(8)>"
38
- "[#7](-c:1:c:c:c:c:c:1)-c2[n+]c(cs2)-c:3:c:c:c:c:c:3","<regId=thiaz_ene_D(8)>"
39
- "n:1:c:c:c(:c:1-[#6](-[#1])-[#1])-[#6](-[#1])=[#6]-2-[#6](=[#8])-[#7]-[#6](=[!#6&!#1])-[#7]-2","<regId=ene_rhod_F(8)>"
40
- "[#6]-,:1(=,:[#6](-[#6](-[#1])(-[#6])-[#6])-,:[#16]-,:[#6](-,:[#7;!H0,$([#7]-[#6;!H0;!H1])]-,:1)=[#8])-[#16]-[#6;R]","<regId=thiaz_ene_E(8)>"
41
- "[!#1]:,-1:[!#1]-,:2:[!#1](:[!#1]:[!#1]:[!#1]:,-1)-,:[#7](-[#1])-,:[#7](-,:[#6]-,:2=[#8])-[#6]","<regId=het_65_B(7)>"
42
- "c:1:c:c-2:c(:c:c:1)-[#6](=[#6](-[#6]-2=[#8])-[#6])-[#8]-[#1]","<regId=keto_keto_beta_C(7)>"
43
- "c:2:c:c:1:n:n:c(:n:c:1:c:c:2)-[#6](-[#1])(-[#1])-[#6]=[#8]","<regId=het_66_A(7)>"
44
- "c:1:c:c:c:c:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:n:c:c:c:c:2","<regId=thio_urea_E(7)>"
45
- "[#6](-[#1])-[#6](-[#1])(-[#1])-c:1:c(:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-[#6]-[#6]-[#6]=[#8])-[$([#6](=[#8])-[#8]),$([#6]#[#7])])-[#6](-[#1])-[#1]","<regId=thiophene_amino_C(7)>"
46
- "[#6](-c:1:c(:c(:[c;!H0,$(c-[#6;X4])]:c:c:1-[#1])-[#1])-[#1])(-c:2:c(:c(:[c;!H0,$(c-[#17])](:c(:c:2-[#1])-[#1]))-[#1])-[#1])=[$([#7]-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]),$([#7]-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]),$([#7]-[#7](-[#1])-[#6](=[#7]-[#1])-[#7](-[#1])-[#1]),$([#6](-[#1])-[#7])]","<regId=hzone_phenone(7)>"
47
- "[#8](-[#1])-[#6](=[#8])-c:1:c:c(:c:c:c:1)-[#6]:[!#1]:[#6]-[#6](-[#1])=[#6]-2-[#6](=[!#6&!#1])-[#7]-[#6](=[!#6&!#1])-[!#6&!#1]-2","<regId=ene_rhod_G(7)>"
48
- "[#6]-1(=[#6]-[#6](-c:2:c:c(:c(:n:c-1:2)-[#7](-[#1])-[#1])-[#6]#[#7])=[#6])-[#6]#[#7]","<regId=ene_cyano_B(7)>"
49
- "[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6](-[#6]:[#6])-[#8]-1)-[#6]#[#7]","<regId=dhp_amino_CN_C(7)>"
50
- "[#7]-2(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#6]=[#8])-[#6;X4]-[#6]-2=[#8]","<regId=het_5_A(7)>"
51
- "[#7]-1=[#6]-[#6](-[#6](-[#7]-1)=[#16])=[#6]","<regId=ene_five_het_H(6)>"
52
- "c1(coc(c1-[#1])-[#6](=[#16])-[#7]-2-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[!#1]-[#6](-[#1])(-[#1])-[#6]-2(-[#1])-[#1])-[#1]","<regId=thio_amide_A(6)>"
53
- "[#6]=[#6](-[#6]#[#7])-[#6](=[#7]-[#1])-[#7]-[#7]","<regId=ene_cyano_C(6)>"
54
- "c:1(:c(:c(:[c;!H0,$(c-[#6;!H0;!H1])](:o:1))-[#1])-[#1])-[#6;!H0,$([#6]-[#6;!H0;!H1])]=[#7]-[#7](-[#1])-c:2:n:c:c:s:2","<regId=hzone_furan_A(6)>"
55
- "c:1(:c(:c(:c(:c(:c:1-[#7](-[#1])-[#16](=[#8])(=[#8])-[#6]:2:[#6]:[!#1]:[#6]:[#6]:[#6]:2)-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_H(6)>"
56
- "n2c1ccccn1c(c2-[$([#6](-[!#1])=[#6](-[#1])-[#6]:[#6]),$([#6]:[#8]:[#6])])-[#7]-[#6]:[#6]","<regId=het_65_C(6)>"
57
- "[#6]-1-[#7](-[#1])-[#7](-[#1])-[#6](=[#16])-[#7]-[#7]-1-[#1]","<regId=thio_urea_F(6)>"
58
- "c:1(:c:c:c:o:1)-[#6](-[#1])=!@[#6]-3-[#6](=[#8])-c:2:c:c:c:c:c:2-[!#6&!#1]-3","<regId=ene_five_het_I(6)>"
59
- "[#8]=[#6]-1-[#6;X4]-[#6]-[#6](=[#8])-c:2:c:c:c:c:c-1:2","<regId=keto_keto_gamma(5)>"
60
- "c:1:c:c-2:c(:c:c:1)-[#6](-c3cccc4noc-2c34)=[#8]","<regId=quinone_B(5)>"
61
- "[#8](-[#1])-c:1:n:c(:c:c:c:1)-[#8]-[#1]","<regId=het_6_pyridone_OH(5)>"
62
- "c:1:2:c(:c(:c(:c(:c:1:c(:c(:c(:c:2-[#1])-[#1])-[#6]=[#7]-[#7](-[#1])-[$([#6]:[#6]),$([#6]=[#16])])-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=hzone_naphth_A(5)>"
63
- "[#6]-,:1=,:[#6](-,:[#16]-,:[#6](-,:[#6]=,:[#6]-,:1)=[#16])-,:[#7]","<regId=thio_ester_A(5)>"
64
- "[#6]-1=[#6]-[#6](-[#8]-[#6]-1-[#8])(-[#8])-[#6]","<regId=ene_misc_A(5)>"
65
- "[#8]=[#6]-,:1-,:[#6](=,:[#6]-,:[#6](=,:[#7]-,:[#7]-,:1)-,:[#6]=[#8])-[#6]#[#7]","<regId=cyano_pyridone_D(5)>"
66
- "c3cn1c(nc(c1-[#7]-[#6])-c:2:c:c:c:c:n:2)cc3","<regId=het_65_Db(5)>"
67
- "[#7]-2-c:1:c:c:c:c:c:1-[#6](=[#7])-c:3:c-2:c:c:c:c:3","<regId=het_666_A(5)>"
68
- "c:1:c(:c:c:c:c:1)-[#7]-2-[#6](-[#1])-[#6](-[#1])-[#7](-[#6](-[#1])-[#6]-2-[#1])-[#16](=[#8])(=[#8])-c:3:c:c:c:c:4:n:s:n:c:3:4","<regId=diazox_sulfon_B(5)>"
69
- "c:1(:c(:c-,:2:c(:c(:c:1-[#1])-[#1])-,:[#7](-,:[#6](-,:[#7]-,:2-[#1])=[#8])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])-[#1]","<regId=anil_NH_alk_A(5)>"
70
- "c:1(:c(:c-3:c(:c(:c:1-[#7](-[#1])-[#16](=[#8])(=[#8])-c:2:c:c:c(:c:c:2)-[!#6&!#1])-[#1])-[#8]-[#6](-[#8]-3)(-[#1])-[#1])-[#1])-[#1]","<regId=sulfonamide_C(5)>"
71
- "[#6](-[#1])-[#6]:2:[#7]:[#7](-c:1:c:c:c:c:c:1):[#16]:3:[!#6&!#1]:[!#1]:[#6]:[#6]:2:3","<regId=het_thio_N_55(5)>"
72
- "[#8]=[#6]-[#6]=[#6](-[#1])-[#8]-[#1]","<regId=keto_keto_beta_D(5)>"
73
- "[#7]-,:1-,:2-,:[#6](=,:[#7]-,:[#6](=[#8])-,:[#6](=,:[#7]-,:1)-[#6](-[#1])-[#1])-,:[#16]-,:[#6](=[#6](-[#1])-[#6]:[#6])-,:[#6]-,:2=[#8]","<regId=ene_rhod_H(5)>"
74
- "[#6]:[#6]-[#6](-[#1])=[#6](-[#1])-[#6](-[#1])=[#7]-[#7](-[#6;X4])-[#6;X4]","<regId=imine_ene_A(5)>"
75
- "c:1:3:c(:c:c:c:c:1):c:2:n:n:c(-[#16]-[#6](-[#1])(-[#1])-[#6]=[#8]):n:c:2:n:3-[#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1]","<regId=het_thio_656a(5)>"
76
- "n1(-[#6])c(c(-[#1])c(c1-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](=[#16])-[#7]-[#1])-[#1])-[#1]","<regId=pyrrole_D(5)>"
77
- "n2(-[#6]:1:[!#1]:[!#6&!#1]:[!#1]:[#6]:1-[#1])c(c(-[#1])c(c2-[#6;X4])-[#1])-[#6;X4]","<regId=pyrrole_E(5)>"
78
- "c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-[#6]([#7;R])[#7;R]","<regId=thio_urea_G(5)>"
79
- "c:1(:c(:c(:c(:c(:[c;!H0,$(c-[#6](-[#1])-[#1])]:1)-[#1])-[#8]-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[$([#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]),$([#6](-[#1])(-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](=[#16])-[#7]-[#1])])-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=anisol_A(5)>"
80
- "n2(-[#6]:1:[#6](-[#6]#[#7]):[#6]:[#6]:[!#6&!#1]:1)c(c(-[#1])c(c2)-[#1])-[#1]","<regId=pyrrole_F(5)>"
81
- "[#7](-[#1])(-[#1])-[#6]-2=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-c:1:c(:c:c:s:1)-[#8]-2","<regId=dhp_amino_CN_D(5)>"
82
- "[#7](-[#1])-c:1:n:c(:c:s:1)-c:2:c:n:c(-[#7](-[#1])-[#1]):s:2","<regId=thiazole_amine_A(4)>"
83
- "[#7]=[#6]-1-[#7](-[#1])-[#6](=[#6](-[#7]-[#1])-[#7]=[#7]-1)-[#7]-[#1]","<regId=het_6_imidate_A(4)>"
84
- "c:1:c(:c:2:c(:c:c:1):c:c:c:c:2)-[#8]-c:3:c(:c(:c(:c(:c:3-[#1])-[#1])-[#7]-[#1])-[#1])-[#1]","<regId=anil_OC_no_alk_B(4)>"
85
- "c:1:c:c-2:c(:c:c:1)-[#6]-[#16]-c3c(-[#6]-2=[#6])ccs3","<regId=styrene_C(4)>"
86
- "c:2:c:c:c:1:c(:c:c:c:1):c:c:2","<regId=azulene(4)>"
87
- "c:1(:c(:c(:c(:o:1)-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#8]-[#6]:[#6])-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=furan_acid_A(4)>"
88
- "[!#1]:[#6]-[#6]-,:1=,:[#6](-[#1])-,:[#6](=,:[#6](-[#6]#[#7])-,:[#6](=[#8])-,:[#7]-,:1-[#1])-[#6]:[#8]","<regId=cyano_pyridone_E(4)>"
89
- "[#6]-1-3=[#6](-[#6](-[#7]-c:2:c:c:c:c:c-1:2)(-[#6])-[#6])-[#16]-[#16]-[#6]-3=[!#1]","<regId=anil_alk_thio(4)>"
90
- "c:1(:c(:c(:c(:c(:c:1-[#7](-[#1])-[#6](=[#8])-c:2:c:c:c:c:c:2)-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_I(4)>"
91
- "[#6](-[#1])(-[#1])-[#16;X2]-c:1:n:n:c(:c(:n:1)-c:2:c(:c(:c(:o:2)-[#1])-[#1])-[#1])-c:3:c(:c(:c(:o:3)-[#1])-[#1])-[#1]","<regId=het_thio_6_furan(4)>"
92
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6]-2=[#6]-c:1:c(:c:c:c:c:1)-[#6]-2(-[#1])-[#1]","<regId=anil_di_alk_ene_B(4)>"
93
- "[#7](-[#1])(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#6](=[#8])-[#6](-[#1])-[#1])-[#7](-[#1])-[$([#7]-[#1]),$([#6]:[#6])]","<regId=imine_one_B(4)>"
94
- "c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1]):o:c:3:c(-[#1]):c(:c(-[#8]-[#6](-[#1])-[#1]):c(:c:2:3)-[#1])-[#7](-[#1])-[#6](-[#1])-[#1]","<regId=anil_OC_alk_A(4)>"
95
- "[#16]=[#6]-,:1-,:[#7](-[#1])-,:[#6]=,:[#6]-,:[#6]-2=,:[#6]-,:1-[#6](=[#8])-[#8]-[#6]-2=[#6]-[#1]","<regId=ene_five_het_J(4)>"
96
- "n2(-c:1:c(:c:c(:c(:c:1)-[#1])-[$([#7](-[#1])-[#1]),$([#6]:[#7])])-[#1])c(c(-[#1])c(c2-[#1])-[#1])-[#1]","<regId=pyrrole_G(4)>"
97
- "n1(-[#6])c(c(-[#1])c(c1-[#6](-[#1])=[#6]-2-[#6](=[#8])-[!#6&!#1]-[#6]=,:[!#1]-2)-[#1])-[#1]","<regId=ene_five_het_K(4)>"
98
- "[#6]=[#6]-[#6](-[#6]#[#7])(-[#6]#[#7])-[#6](-[#6]#[#7])=[#6]-[#7](-[#1])-[#1]","<regId=cyano_ene_amine_B(4)>"
99
- "[#6]:[#6]-[#6](=[#16;X1])-[#16;X2]-[#6](-[#1])-[$([#6](-[#1])-[#1]),$([#6]:[#6])]","<regId=thio_ester_B(4)>"
100
- "[#8]=[#6]-3-[#6](=!@[#6](-[#1])-c:1:c:n:c:c:1)-c:2:c:c:c:c:c:2-[#7]-3","<regId=ene_five_het_L(4)>"
101
- "c:1(:[c;!H0,$(c-[#6;!H0;!H1])](:c(:c(:s:1)-[#1])-[#1]))-[#6](-[#1])=[#7]-[#7](-[#1])-c:2:c:c:c:c:c:2","<regId=hzone_thiophene_B(4)>"
102
- "[#6](-[#1])(-[#1])-[#16;X2]-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](-[#6]#[#7])-[#6](=[#8])-[#7]-1","<regId=dhp_amino_CN_E(4)>"
103
- "[#7]-2(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#7](-[#1])-[#6]=[#8])-[#6](-[#1])(-[#1])-[#6]-2=[#8]","<regId=het_5_B(4)>"
104
- "[#6]:[#6]-[#6](-[#1])=[#6](-[#1])-[#6](-[#1])=[#7]-[#7]=[#6]","<regId=imine_imine_B(3)>"
105
- "c:1(:c:c:c(:c:c:1)-[#6](-[#1])-[#1])-c:2:c(:s:c(:n:2)-[#7](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#1]","<regId=thiazole_amine_B(3)>"
106
- "[#6]-2(-[#6]=[#7]-c:1:c:c:c:c:c:1-[#7]-2)=[#6](-[#1])-[#6]=[#8]","<regId=imine_ene_one_A(3)>"
107
- "[#8](-c:1:c:c:c:c:c:1)-c:3:c:c:2:n:o:n:c:2:c:c:3","<regId=diazox_A(3)>"
108
- "[!#1]:1:[!#1]:[!#1]:[!#1](:[!#1]:[!#1]:1)-[#6](-[#1])=[#6](-[#1])-[#6](-[#7]-c:2:c:c:c:3:c(:c:2):c:c:c(:n:3)-[#7](-[#6])-[#6])=[#8]","<regId=ene_one_A(3)>"
109
- "[#7](-[#1])(-[#1])-c:1:c(:c:c:c:n:1)-[#8]-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=anil_OC_no_alk_C(3)>"
110
- "[#6]-[#16;X2]-c:1:n:c(:c:s:1)-[#1]","<regId=thiazol_SC_A(3)>"
111
- "c:1:c-3:c(:c:c:c:1)-[#7](-c:2:c:c:c:c:c:2-[#8]-3)-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=het_666_B(3)>"
112
- "c:1(:c(:c(:c(:o:1)-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#8]-[#1])-[#6]#[#6]-[#6;X4]","<regId=furan_A(3)>"
113
- "[#6]-1(-[#6](=[#6]-[#6]=[#6]-[#6]=[#6]-1)-[#7]-[#1])=[#7]-[#6]","<regId=colchicine_A(3)>"
114
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])=[#6]-[#6](=[#8])-c:1:c(-[#16;X2]):s:c(:c:1)-[$([#6]#[#7]),$([#6]=[#8])]","<regId=thiophene_C(3)>"
115
- "c:1:3:c(:c:c:c:c:1)-[#7]-2-[#6](=[#8])-[#6](=[#6](-[F,Cl,Br,I])-[#6]-2=[#8])-[#7](-[#1])-[#6]:[#6]:[#6]:[#6](-[#8]-[#6](-[#1])-[#1]):[#6]:[#6]:3","<regId=anil_OC_alk_B(3)>"
116
- "c:1-2:c(:c:c:c:c:1)-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]=[#6]-2-[#16;X2]-[#6](-[#1])(-[#1])-[#6](=[#8])-c:3:c:c:c:c:c:3","<regId=het_thio_66_A(3)>"
117
- "[#7]-2(-c:1:c:c:c:c:c:1-[#6](-[#1])-[#1])-[#6](=[#16])-[#7](-[#6](-[#1])(-[#1])-[!#1]:[!#1]:[!#1]:[!#1]:[!#1])-[#6](-[#1])(-[#1])-[#6]-2=[#8]","<regId=rhod_sat_B(3)>"
118
- "[#7]-2(-[#6](-[#1])-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](=[#6](-[#1])-c:1:c:c:c:c(:c:1)-[Br])-[#6]-2=[#8]","<regId=ene_rhod_I(3)>"
119
- "c:1(:c(:c:2:c(:s:1):c:c:c:c:2)-[#6](-[#1])-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=keto_thiophene(3)>"
120
- "[#7](-[#6](-[#1])-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])=[#7]-[#6](-[#6](-[#1])-[#1])=[#7]-[#7](-[#6](-[#1])-[#1])-[#6]:[#6]","<regId=imine_imine_C(3)>"
121
- "[#6]:2(:[#6](-[#6](-[#1])-[#1]):[#6]-,:1:[#6](-,:[#7]=,:[#6;!H0,$([#6]-[#16]-[#6](-[#1])-[#1])](-,:[#7](-,:[#6]-,:1=[!#6&!#1;X1])-[#6](-[#1])-[$([#6](=[#8])-[#8]),$([#6]:[#6])])):[!#6&!#1;X2]:2)-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=het_65_pyridone_A(3)>"
122
- "c:1(:n:c(:c(-[#1]):s:1)-[!#1]:[!#1]:[!#1](-[$([#8]-[#6](-[#1])-[#1]),$([#6](-[#1])-[#1])]):[!#1]:[!#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c(-[#1]):c(:c(-[#1]):o:2)-[#1]","<regId=thiazole_amine_C(3)>"
123
- "n:1:c(:c(:c(:c(:c:1-[#16]-[#6]-[#1])-[#6]#[#7])-c:2:c:c:c(:c:c:2)-[#8]-[#6](-[#1])-[#1])-[#1])-[#6]:[#6]","<regId=het_thio_pyr_A(3)>"
124
- "c:1:4:c(:n:c(:n:c:1-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c(:c(:c(:o:2)-[#1])-[#1])-[#1])-[#7](-[#1])-c:3:c:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#16;X2]),$(c-[#8]-[#6]-[#1]),$(c-[#7;X3])](:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#16;X2]),$(c-[#8]-[#6]-[#1]),$(c-[#7;X3])](:c:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#16;X2]),$(c-[#8]-[#6]-[#1]),$(c-[#7;X3])]:3))):c:c:c:c:4","<regId=melamine_A(3)>"
125
- "[#7](-[#1])(-[#6]:1:[#6]:[#6]:[!#1]:[#6]:[#6]:1)-c:2:c:c:c(:c:c:2)-[#7](-[#1])-[#6]-[#1]","<regId=anil_NH_alk_B(3)>"
126
- "[#7]-2(-c:1:c:c:c:c:c:1)-[#6](=[#7]-[#6]=[#8])-[#16]-[#6](-[#1])(-[#1])-[#6]-2=[#8]","<regId=rhod_sat_C(3)>"
127
- "[#6]=[#6]-[#6](=[#8])-[#7]-c:1:c(:c(:c(:s:1)-[#6](=[#8])-[#8])-[#6]-[#1])-[#6]#[#7]","<regId=thiophene_amino_D(3)>"
128
- "[#8;!H0,$([#8]-[#6](-[#1])-[#1])]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:n:c:c:n:2","<regId=anil_OC_alk_C(3)>"
129
- "[#6](-[#1])(-[#1])-[#16;X2]-c3nc1c(n(nc1-[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2)nn3","<regId=het_thio_65_A(3)>"
130
- "[#6]-[#6](=[#8])-[#6](-[#1])(-[#1])-[#16;X2]-c:3:n:n:c:2:c:1:c(:c(:c(:c(:c:1:n(:c:2:n:3)-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=het_thio_656b(3)>"
131
- "s:1:c(:[n+](-[#6](-[#1])-[#1]):c(:c:1-[#1])-[#6])-[#7](-[#1])-c:2:c:c:c:c:c:2[$([#6](-[#1])-[#1]),$([#6]:[#6])]","<regId=thiazole_amine_D(3)>"
132
- "[#6]-,:2(=[#16])-,:[#7](-[#6](-[#1])(-[#1])-c:1:c:c:c:o:1)-,:[#6](=,:[#7]-,:[#7]-,:2-[#1])-[#6]:[#6]","<regId=thio_urea_H(3)>"
133
- "[#7]-,:2(-c:1:c:c:c:c:c:1)-,:[#6](=[#8])-,:[#6](=,:[#6]-,:[#6](=,:[#7]-,:2)-[#6]#[#7])-[#6]#[#7]","<regId=cyano_pyridone_F(3)>"
134
- "[#7]-2(-c:1:c:c:c:c:c:1)-[#6](=[#8])-[#16]-[#6](-[#1])(-[#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6])-[#6]-2=[#8]","<regId=rhod_sat_D(3)>"
135
- "[#6](-[#1])(-[#1])-[#7]-2-[#6](=[$([#16]),$([#7])])-[!#6&!#1]-[#6](=[#6]-1-[#6](=[#6](-[#1])-[#6]:[#6]-[#7]-1-[#6](-[#1])-[#1])-[#1])-[#6]-2=[#8]","<regId=ene_rhod_J(3)>"
136
- "[#6]=[#7;!R]-c:1:c:c:c:c:c:1-[#8]-[#1]","<regId=imine_phenol_A(3)>"
137
- "[#8]=[#6]-,:2-,:[#16]-,:c:1:c(:c(:c:c:c:1)-[#8]-[#6](-[#1])-[#1])-,:[#8]-,:2","<regId=thio_carbonate_B(3)>"
138
- "[#7]=,:[#6]-,:1-,:[#7]=,:[#6]-,:[#7]-,:[#16]-,:1","<regId=het_thio_N_5A(3)>"
139
- "[#7]-,:2-,:[#16]-,:[#6]-1=,:[#6](-[#6]:[#6]-[#7]-[#6]-1)-,:[#6]-,:2=[#16]","<regId=het_thio_N_65A(3)>"
140
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])=[#7]-[#7]=[#6](-[#6])-[#6]:[#6])-[#1])-[#1]","<regId=anil_di_alk_J(3)>"
141
- "n1-2cccc1-[#6]=[#7](-[#6])-[#6]-[#6]-2","<regId=pyrrole_H(3)>"
142
- "[#6](-[#6]#[#7])(-[#6]#[#7])=[#6](-[#16])-[#16]","<regId=ene_cyano_D(3)>"
143
- "[#6]-1(-[#6]#[#7])(-[#6]#[#7])-[#6](-[#1])(-[#6](=[#8])-[#6])-[#6]-1-[#1]","<regId=cyano_cyano_B(3)>"
144
- "[#6]-1=,:[#6]-[#6](-[#6](-[$([#8]),$([#16])]-1)=[#6]-[#6]=[#8])=[#8]","<regId=ene_five_het_M(3)>"
145
- "[#6]:[#6]-[#6](=[#8])-[#7](-[#1])-[#6](=[#8])-[#6](-[#6]#[#7])=[#6](-[#1])-[#7](-[#1])-[#6]:[#6]","<regId=cyano_ene_amine_C(3)>"
146
- "c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#7]=[#6]-c:2:c:n:c:c:2","<regId=thio_urea_I(3)>"
147
- "[#7](-[#1])(-[#1])-[#6]-2=[#6](-[#6]#[#7])-[#6](-[#1])(-c:1:c:c:c:s:1)-[#6](=[#6](-[#6](-[#1])-[#1])-[#8]-2)-[#6](=[#8])-[#8]-[#6]","<regId=dhp_amino_CN_F(3)>"
148
- "c:1:c-3:c(:c:c(:c:1)-[#6](=[#8])-[#7](-[#1])-c:2:c(:c:c:c:c:2)-[#6](=[#8])-[#8]-[#1])-[#6](-[#7](-[#6]-3=[#8])-[#6](-[#1])-[#1])=[#8]","<regId=anthranil_acid_B(3)>"
149
- "[Cl]-c:2:c:c:1:n:o:n:c:1:c:c:2","<regId=diazox_B(3)>"
150
- "[#6]-[#6](=[#16])-[#1]","<regId=thio_aldehyd_A(3)>"
151
- "[#6;X4]-[#7](-[#1])-[#6](-[#6]:[#6])=[#6](-[#1])-[#6](=[#16])-[#7](-[#1])-c:1:c:c:c:c:c:1","<regId=thio_amide_B(2)>"
152
- "[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#16]-[#6](-[#1])(-[#1])-c1cn(cn1)-[#1]","<regId=imidazole_B(2)>"
153
- "[#8]=[#6]-[#7](-[#1])-c:1:c(-[#6]:[#6]):n:c(-[#6](-[#1])(-[#1])-[#6]#[#7]):s:1","<regId=thiazole_amine_E(2)>"
154
- "[#6](-[#1])-[#7](-[#1])-c:1:n:c(:c:s:1)-c2cnc3n2ccs3","<regId=thiazole_amine_F(2)>"
155
- "[#7]-,:1-,:[#6](=[#8])-,:[#6](=,:[#6](-[#6])-,:[#16]-,:[#6]-,:1=[#16])-[#1]","<regId=thio_ester_C(2)>"
156
- "[#6](-[#16])(-[#7])=[#6](-[#1])-[#6]=[#6](-[#1])-[#6]=[#8]","<regId=ene_one_B(2)>"
157
- "[#8]=[#6]-3-c:1:c(:c:c:c:c:1)-[#6]-2=[#6](-[#8]-[#1])-[#6](=[#8])-[#7]-c:4:c-2:c-3:c:c:c:4","<regId=quinone_C(2)>"
158
- "c:1:2:c:c:c:c(:c:1:c(:c:c:c:2)-[$([#8]-[#1]),$([#7](-[#1])-[#1])])-[#6](-[#6])=[#8]","<regId=keto_naphthol_A(2)>"
159
- "[#6](-[#1])(-c:1:c:c:c:c:c:1)(-c:2:c:c:c:c:c:2)-[#6](=[#16])-[#7]-[#1]","<regId=thio_amide_C(2)>"
160
- "[#7]-2(-[#6](=[#8])-c:1:c(:c(:c(:c(:c:1-[#1])-[#6](=[#8])-[#8]-[#1])-[#1])-[#1])-[#6]-2=[#8])-c:3:c(:c:c(:c(:c:3)-[#1])-[#8])-[#1]","<regId=phthalimide_misc(2)>"
161
- "c:1:c:c(:c:c:c:1-[#7](-[#1])-[#16](=[#8])=[#8])-[#7](-[#1])-[#16](=[#8])=[#8]","<regId=sulfonamide_D(2)>"
162
- "[#6](-[#1])-[#7](-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6]-[#1]","<regId=anil_NH_alk_C(2)>"
163
- "s1c(c(c-,:2c1-,:[#7](-[#1])-,:[#6](-,:[#6](=,:[#6]-,:2-[#1])-[#6](=[#8])-[#8]-[#1])=[#8])-[#7](-[#1])-[#1])-[#6](=[#8])-[#7]-[#1]","<regId=het_65_E(2)>"
164
- "c:2(:c:1:c(:c(:c(:c(:c:1:c(:c(:c:2-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#7](-[#1])-[#6]=[#8])-[#1])-[#1])-[#1]","<regId=hzide_naphth(2)>"
165
- "[#6](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6;X4])-[#1]","<regId=anisol_B(2)>"
166
- "[#6]-1=[#6]-[#7]-[#6](-[#16]-[#6;X4]-1)=[#16]","<regId=thio_carbam_ene(2)>"
167
- "[#6](-[#7](-[#6]-[#1])-[#6]-[#1]):[#6]-[#7](-[#1])-[#6](=[#16])-[#6]-[#1]","<regId=thio_amide_D(2)>"
168
- "n2nc(c1cccc1c2-[#6])-[#6]","<regId=het_65_Da(2)>"
169
- "s:1:c(:c(-[#1]):c(:c:1-[#6](=[#8])-[#7](-[#1])-[#7]-[#1])-[#8]-[#6](-[#1])-[#1])-[#1]","<regId=thiophene_D(2)>"
170
- "[#6]-1:[#6]-[#7]=[#6]-[#6](=[#6]-[#7]-[#6])-[#16]-1","<regId=het_thio_6_ene(2)>"
171
- "[#6](-[#1])(-[#1])-[#6](-[#1])(-[#6]#[#7])-[#6](=[#8])-[#6]","<regId=cyano_keto_A(2)>"
172
- "c2(c(-[#7](-[#1])-[#1])n(-c:1:c:c:c:c:c:1-[#6](=[#8])-[#8]-[#1])nc2-[#6]=[#8])-[$([#6]#[#7]),$([#6]=[#16])]","<regId=anthranil_acid_C(2)>"
173
- "c:2:c:1:c:c:c:c-,:3:c:1:c(:c:c:2)-,:[#7](-,:[#7]=,:[#6]-,:3)-[#1]","<regId=naphth_amino_C(2)>"
174
- "c:2:c:1:c:c:c:c-,:3:c:1:c(:c:c:2)-,:[#7]-,:[#7]=,:[#7]-,:3","<regId=naphth_amino_D(2)>"
175
- "c1csc(n1)-[#7]-[#7]-[#16](=[#8])=[#8]","<regId=thiazole_amine_G(2)>"
176
- "c:1:c:c:c:2:c(:c:1):n:c(:n:c:2)-[#7](-[#1])-[#6]-3=[#7]-[#6](-[#6]=[#6]-[#7]-3-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=het_66_B(2)>"
177
- "c:1-,:3:c(:c(:c(:c(:c:1)-[#8]-[#6]-[#1])-[#1])-[#1])-,:c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-,:[#6](=[#8])-,:[#8]-,:3","<regId=coumarin_A(2)>"
178
- "c:12:c(:c:c:c:n:1)c(c(-[#6](=[#8])~[#8;X1])s2)-[#7](-[#1])-[#1]","<regId=anthranil_acid_D(2)>"
179
- "c:1:2:n:c(:c(:n:c:1:[#6]:[#6]:[#6]:[!#1]:2)-[#6](-[#1])=[#6](-[#8]-[#1])-[#6])-[#6](-[#1])=[#6](-[#8]-[#1])-[#6]","<regId=het_66_C(2)>"
180
- "c1csc(c1-[#7](-[#1])-[#1])-[#6](-[#1])=[#6](-[#1])-c2cccs2","<regId=thiophene_amino_E(2)>"
181
- "c:2:c:c:1:n:c:3:c(:n:c:1:c:c:2):c:c:c:4:c:3:c:c:c:c:4","<regId=het_6666_A(2)>"
182
- "[#6]:[#6]-[#7](-[#1])-[#16](=[#8])(=[#8])-[#7](-[#1])-[#6]:[#6]","<regId=sulfonamide_E(2)>"
183
- "c:1:c:c(:c:c:c:1-[#7](-[#1])-[#1])-[#7](-[#6;X3])-[#6;X3]","<regId=anil_di_alk_K(2)>"
184
- "[#7]-2=[#6](-c:1:c:c:c:c:c:1)-[#6](-[#1])(-[#1])-[#6](-[#8]-[#1])(-[#6](-[#9])(-[#9])-[#9])-[#7]-2-[$([#6]:[#6]:[#6]:[#6]:[#6]:[#6]),$([#6](=[#16])-[#6]:[#6]:[#6]:[#6]:[#6]:[#6])]","<regId=het_5_C(2)>"
185
- "c:1:c(:c:c:c:c:1)-[#6](=[#8])-[#6](-[#1])=[#6]-,:3-,:[#6](=[#8])-,:[#7](-[#1])-,:[#6](=[#8])-,:[#6](=[#6](-[#1])-c:2:c:c:c:c:c:2)-,:[#7]-,:3-[#1]","<regId=ene_six_het_B(2)>"
186
- "[#8]=[#6]-4-[#6]-[#6]-[#6]-3-[#6]-2-[#6](=[#8])-[#6]-[#6]-1-[#6]-[#6]-[#6]-[#6]-1-[#6]-2-[#6]-[#6]-[#6]-3=[#6]-4","<regId=steroid_A(2)>"
187
- "c:1:2:c:3:c(:c(-[#8]-[#1]):c(:c:1:c(:c:n:2-[#6])-[#6]=[#8])-[#1]):n:c:n:3","<regId=het_565_A(2)>"
188
- "[#6;X4]-[#7+](-[#6;X4]-[#8]-[#1])=[#6]-[#16]-[#6]-[#1]","<regId=thio_imine_ium(2)>"
189
- "[#6]-3(=[#8])-[#6](=[#6](-[#1])-[#7](-[#1])-c:1:c:c:c:c:c:1-[#6](=[#8])-[#8]-[#1])-[#7]=[#6](-c:2:c:c:c:c:c:2)-[#8]-3","<regId=anthranil_acid_E(2)>"
190
- "c:1(:c(:c(:[c;!H0,$(c-[#6;!H0;!H1])](:o:1))-[#1])-[#1])-[#6;!H0,$([#6]-[#6;!H0;!H1])]=[#7]-[#7](-[#1])-c:2:c:c:n:c:c:2","<regId=hzone_furan_B(2)>"
191
- "c:1(:c(:c(:[c;!H0,$(c-[#6;!H0,!H1])](:s:1))-[#1])-[#1])-[#6;!H0,$([#6]-[#6;!H0;!H1])]-[#6](=[#8])-[#7](-[#1])-c:2:n:c:c:s:2","<regId=thiophene_E(2)>"
192
- "[#6]:[#6]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#6]=[#8])-[#7]-2-[#6](=[#8])-[#6]-1(-[#1])-[#6](-[#1])(-[#1])-[#6]=[#6]-[#6](-[#1])(-[#1])-[#6]-1(-[#1])-[#6]-2=[#8]","<regId=ene_misc_B(2)>"
193
- "[#6]-1(-[#6]=[#8])(-[#6]:[#6])-[#16;X2]-[#6]=[#7]-[#7]-1-[#1]","<regId=het_thio_5_B(2)>"
194
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-c:2:c:c:c:c:c:2)-[#6]#[#7])-[#6]:3:[!#1]:[!#1]:[!#1]:[!#1]:[!#1]:3","<regId=thiophene_amino_F(2)>"
195
- "[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c:c:c:2-[$([#6](-[#1])-[#1]),$([#8]-[#6](-[#1])-[#1])]","<regId=anil_OC_alk_D(2)>"
196
- "[#6](-[#1])(-[#1])(-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-c:1:c(:c:c(:c(:c:1-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-[#6](-[#1])(-[#1])-[#1])-[#8]-[#6](-[#1])-[#7])-[#1]","<regId=tert_butyl_A(2)>"
197
- "c:1(:c(:o:c:c:1)-[#6]-[#1])-[#6]=[#7]-[#7](-[#1])-[#6](=[#16])-[#7]-[#1]","<regId=thio_urea_J(2)>"
198
- "[#7](-[#1])-c1nc(nc2nnc(n12)-[#16]-[#6])-[#7](-[#1])-[#6]","<regId=het_thio_65_B(2)>"
199
- "c:1-,:2:c(:c:c:c:c:1-[#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1])-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-[#6]:[#6])-,:[#6](=[#8])-,:[#8]-,:2)-[#1]","<regId=coumarin_B(2)>"
200
- "[#6]-2(=[#16])-[#7]-1-[#6]:[#6]-[#7]=[#7]-[#6]-1=[#7]-[#7]-2-[#1]","<regId=thio_urea_K(2)>"
201
- "[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-c:1:c:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-[#6])-[#6](=[#8])-[#8]-[#1]","<regId=thiophene_amino_G(2)>"
202
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c:c:1-[#7](-[#1])-[#6](-[#1])(-[#6])-[#6](-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_NH_alk_D(2)>"
203
- "[#16]=[#6]-,:2-,:[#7](-[#1])-,:[#7]=,:[#6](-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-,:[#8]-,:2","<regId=het_thio_5_C(2)>"
204
- "[#16]=[#6]-c:1:c:c:c:2:c:c:c:c:n:1:2","<regId=thio_keto_het(2)>"
205
- "[#6]~1~[#6](~[#7]~[#7]~[#6](~[#6](-[#1])-[#1])~[#6](-[#1])-[#1])~[#7]~[#16]~[#6]~1","<regId=het_thio_N_5B(2)>"
206
- "[#6]-1(-[#6]=,:[#6]-[#6]=,:[#6]-[#6]-1=[!#6&!#1])=[!#6&!#1]","<regId=quinone_D(2)>"
207
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(-[#1]):c(:c(:o:1)-[#6](-[#1])=[#6]-[#6]#[#7])-[#1]","<regId=anil_di_alk_furan_B(2)>"
208
- "[#8]=[#6]-1-[#6]:[#6]-[#6](-[#1])(-[#1])-[#7]-[#6]-1=[#6]-[#1]","<regId=ene_six_het_C(2)>"
209
- "[#6]:[#6]-[#7]:2:[#7]:[#6]:1-[#6](-[#1])(-[#1])-[#16;X2]-[#6](-[#1])(-[#1])-[#6]:1:[#6]:2-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])=[#6]-[#1]","<regId=het_55_A(2)>"
210
- "n:1:c(:n(:c:2:c:1:c:c:c:c:2)-[#6](-[#1])-[#1])-[#16]-[#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6](-[#1])-[#6](-[#1])=[#6]-[#1]","<regId=het_thio_65_C(2)>"
211
- "c:1(:c:c(:c(:c:c:1)-[#8]-[#1])-[#6](=!@[#6]-[#7])-[#6]=[#8])-[#8]-[#1]","<regId=hydroquin_A(2)>"
212
- "c:1(:c:c(:c(:c:c:1)-[#7](-[#1])-[#6](=[#8])-[#6]:[#6])-[#6](=[#8])-[#8]-[#1])-[#8]-[#1]","<regId=anthranil_acid_F(2)>"
213
- "n2(-[#6](-[#1])-[#1])c-1c(-[#6]:[#6]-[#6]-1=[#8])cc2-[#6](-[#1])-[#1]","<regId=pyrrole_I(2)>"
214
- "[#6](-[#1])-[#7](-[#1])-c:1:c(:c(:c(:s:1)-[#6]-[#1])-[#6]-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6]","<regId=thiophene_amino_H(2)>"
215
- "[#6]:[#6]-[#7;!R]=[#6]-2-[#6](=[!#6&!#1])-c:1:c:c:c:c:c:1-[#7]-2","<regId=imine_one_fives_C(2)>"
216
- "c:1:c:c:c:c:c:1-[#6](=[#8])-[#7](-[#1])-[#7]=[#6]-3-c:2:c:c:c:c:c:2-c:4:c:c:c:c:c-3:4","<regId=keto_phenone_zone_A(2)>"
217
- "c:1:c(:c:c:c:c:1)-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])=[#6](-[#1])-[#6]=!@[#6](-[#1])-[#6](-[#1])=[#6]-[#6]=@[#7]-c:2:c:c:c:c:c:2","<regId=dyes7A(2)>"
218
- "[#6]:1:2:[!#1]:[#7+](:[!#1]:[#6;!H0,$([#6]-[*])](:[!#1]:1:[#6]:[#6]:[#6]:[#6]:2))~[#6]:[#6]","<regId=het_pyridiniums_B(2)>"
219
- "[#7]-2(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#16]-[#6])-[#6]-2=[#8]","<regId=het_5_D(2)>"
220
- "c:1:c:c:c(:c:c:1-[#7](-[#1])-c2nc(c(-[#1])s2)-c:3:c:c:c(:c:c:3)-[#6](-[#1])(-[#6]-[#1])-[#6]-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=thiazole_amine_H(1)>"
221
- "[#6](-[#1])(-[#1])-[#7](-[#1])-[#6]=[#7]-[#7](-[#1])-c1nc(c(-[#1])s1)-[#6]:[#6]","<regId=thiazole_amine_I(1)>"
222
- "[#6]:[#6]-[#7](-[#1])-[#6](=[#8])-c1c(snn1)-[#7](-[#1])-[#6]:[#6]","<regId=het_thio_N_5C(1)>"
223
- "[#8]=[#16](=[#8])(-[#6]:[#6])-[#7](-[#1])-c1nc(cs1)-[#6]:[#6]","<regId=sulfonamide_F(1)>"
224
- "[#8]=[#16](=[#8])(-[#6]:[#6])-[#7](-[#1])-[#7](-[#1])-c1nc(cs1)-[#6]:[#6]","<regId=thiazole_amine_J(1)>"
225
- "s2c:1:n:c:n:c(:c:1c(c2-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#7]-[#7]=[#6]-c3ccco3","<regId=het_65_F(1)>"
226
- "[#6](=[#8])-[#6](-[#1])=[#6](-[#8]-[#1])-[#6](-[#8]-[#1])=[#6](-[#1])-[#6](=[#8])-[#6]","<regId=keto_keto_beta_E(1)>"
227
- "c:2(:c:1-[#6](-[#6](-[#6](-c:1:c(:c(:c:2-[#1])-[#1])-[#1])(-[#1])-[#1])=[#8])=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1]","<regId=ene_five_one_B(1)>"
228
- "[#6]:[#6]-[#7](-[#1])-[#7]=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#6](-[#1])-[#1])=[#7]-[#7](-[#1])-[#6]:[#6]","<regId=keto_keto_beta_zone(1)>"
229
- "[#6;X4]-[#16;X2]-[#6](=[#7]-[!#1]:[!#1]:[!#1]:[!#1])-[#7](-[#1])-[#7]=[#6]","<regId=thio_urea_L(1)>"
230
- "[#6]-1(=[#7]-[#7](-[#6](-[#16]-1)=[#6](-[#1])-[#6]:[#6])-[#6]:[#6])-[#6]=[#8]","<regId=het_thio_urea_ene(1)>"
231
- "c:1(:c(:c:2:c(:n:c:1-[#7](-[#1])-[#1]):c:c:c(:c:2-[#7](-[#1])-[#1])-[#6]#[#7])-[#6]#[#7])-[#6]#[#7]","<regId=cyano_amino_het_A(1)>"
232
- "[!#1]:1:[!#1]:[!#1]:[!#1](:[!#1]:[!#1]:1)-[#6](-[#1])=[#6](-[#1])-[#6](-[#7](-[#1])-[#7](-[#1])-c2nnnn2-[#6])=[#8]","<regId=tetrazole_hzide(1)>"
233
- "c:1:2:c(:c(:c(:c(:c:1:c(:c(:c(:c:2-[#1])-[#1])-[#6](=[#7]-[#6]:[#6])-[#6](-[#1])-[#1])-[#8]-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=imine_naphthol_A(1)>"
234
- "c:1(:c(:c:2:c(:c(:c:1-[#8]-[#6](-[#1])-[#1])-[#1]):c(:c(:c(:c:2-[#7](-[#1])-[#6](-[#1])(-[#1])-[#1])-[#1])-c:3:c(:c(:c(:c(:c:3-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=misc_anisole_A(1)>"
235
- "c:1:c:c-2:c(:c:c:1)-[#16]-c3c(-[#7]-2)cc(s3)-[#6](-[#1])-[#1]","<regId=het_thio_665(1)>"
236
- "c:1:c:c:c-2:c(:c:1)-[#6](-[#6](-[#7]-2-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]-4-[#6](-c:3:c:c:c:c:c:3-[#6]-4=[#8])=[#8])(-[#1])-[#1])(-[#1])-[#1]","<regId=anil_di_alk_L(1)>"
237
- "c:1(:c:c:c(:c:c:1)-[#6]-,:3=,:[#6]-,:[#6](-,:c2cocc2-,:[#6](=,:[#6]-,:3)-[#8]-[#1])=[#8])-[#16]-[#6](-[#1])-[#1]","<regId=colchicine_B(1)>"
238
- "[#6;X4]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#16]-[#6](-[#1])(-[#1])-[#1])-[#6](=[#8])-[#8]-[#1])-[#1])-[#1]","<regId=misc_aminoacid_A(1)>"
239
- "n:1:c(:n(:c(:c:1-c:2:c:c:c:c:c:2)-c:3:c:c:c:c:c:3)-[#7]=!@[#6])-[#7](-[#1])-[#1]","<regId=imidazole_amino_A(1)>"
240
- "[#6](-c:1:c:c:c(:c:c:1)-[#8]-[#1])(-c:2:c:c:c(:c:c:2)-[#8]-[#1])-[#8]-[#16](=[#8])=[#8]","<regId=phenol_sulfite_A(1)>"
241
- "c:2:c:c:1:n:c(:c(:n:c:1:c:c:2)-[#6](-[#1])(-[#1])-[#6](=[#8])-[#6]:[#6])-[#6](-[#1])(-[#1])-[#6](=[#8])-[#6]:[#6]","<regId=het_66_D(1)>"
242
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c(-[#6](-[#1])-[#1])c:c:2","<regId=misc_anisole_B(1)>"
243
- "[#6](-[#1])(-[#1])-c1nnnn1-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#1])-[#1]","<regId=tetrazole_A(1)>"
244
- "[#6]-2(=[#7]-c1c(c(nn1-[#6](-[#6]-2(-[#1])-[#1])=[#8])-[#7](-[#1])-[#1])-[#7](-[#1])-[#1])-[#6]","<regId=het_65_G(1)>"
245
- "[#6](-[#6]:[#6])(-[#6]:[#6])(-[#6]:[#6])-[#16]-[#6]:[#6]-[#6](=[#8])-[#8]-[#1]","<regId=misc_trityl_A(1)>"
246
- "[#8]=[#6](-c:1:c(:c(:n:c(:c:1-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=misc_pyridine_OC(1)>"
247
- "[#7]-1=[#6](-[#7](-[#6](-[#6](-[#6]-1(-[#1])-[#6]:[#6])(-[#1])-[#1])=[#8])-[#1])-[#7]-[#1]","<regId=het_6_hydropyridone(1)>"
248
- "[#6]-1(=[#6](-[#6](-[#6](-[#6](-[#6]-1(-[#1])-[#1])(-[#1])-[#6](=[#8])-[#6])(-[#1])-[#6](=[#8])-[#8]-[#1])(-[#1])-[#1])-[#6]:[#6])-[#6]:[#6]","<regId=misc_stilbene(1)>"
249
- "[#6](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[Cl])-[#1])-[#1])(-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[Cl])-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-c3nc(c(n3-[#6](-[#1])(-[#1])-[#1])-[#1])-[#1]","<regId=misc_imidazole(1)>"
250
- "n:1:c(:c(:c(:c(:c:1-[#1])-[#7](-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6]:[#6]","<regId=anil_NH_no_alk_A(1)>"
251
- "[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#8]-[#1])-[#6]-2=[#6](-[#8]-[#6](-[#7]=[#7]-2)=[#7])-[#7](-[#1])-[#1]","<regId=het_6_imidate_B(1)>"
252
- "[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=anil_alk_B(1)>"
253
- "c:1:c:c-3:c(:c:c:1)-c:2:c:c:c(:c:c:2-[#6]-3=[#6](-[#1])-[#6])-[#7](-[#1])-[#1]","<regId=styrene_anil_A(1)>"
254
- "c:1:c:c-2:c(:c:c:1)-[#7](-[#6](-[#8]-[#6]-2)(-[#6](=[#8])-[#8]-[#1])-[#6](-[#1])-[#1])-[#6](=[#8])-[#6](-[#1])-[#1]","<regId=misc_aminal_acid(1)>"
255
- "n:1:c(:c(:c(:c(:c:1-[#7](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=anil_no_alk_D(1)>"
256
- "[#7](-[#1])(-c:1:c:c:c:c:c:1)-[#6](-[#6])(-[#6])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=anil_alk_C(1)>"
257
- "[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#8]-[#6]-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])(-[#1])-[#1])-[#6]:[#6]","<regId=misc_anisole_C(1)>"
258
- "c:1-2:c:c-3:c(:c:c:1-[#8]-[#6]-[#8]-2)-[#6]-[#6]-3","<regId=het_465_misc(1)>"
259
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](=[#8])-[#8]-[#1])-[#7](-[#1])-[#6]:[#6]","<regId=anthranil_acid_G(1)>"
260
- "c:1(:c:4:c(:n:c(:c:1-[#6](-[#1])(-[#1])-[#7]-3-c:2:c(:c(:c(:c(:c:2-[#6](-[#1])(-[#1])-[#6]-3(-[#1])-[#1])-[#1])-[#1])-[#1])-[#1])-[#1]):c(:c(:c(:c:4-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_M(1)>"
261
- "c:1:c(:c2:c(:c:c:1)c(c(n2-[#1])-[#6]:[#6])-[#6]:[#6])-[#6](=[#8])-[#8]-[#1]","<regId=anthranil_acid_H(1)>"
262
- "[#6]:[#6]-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-c:1:c(:c(:c(:c(:c:1-[F,Cl,Br,I])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=thio_urea_M(1)>"
263
- "n:1:c3:c(:c:c2:c:1nc(s2)-[#7])sc(n3)-[#7]","<regId=thiazole_amine_K(1)>"
264
- "[#7]=[#6]-1-[#16]-[#6](=[#7])-[#7]=[#6]-1","<regId=het_thio_5_imine_A(1)>"
265
- "c:1:c(:n:c:c:c:1)-[#6](=[#16])-[#7](-[#1])-c:2:c(:c:c:c:c:2)-[#8]-[#6](-[#1])-[#1]","<regId=thio_amide_E(1)>"
266
- "c:1-2:c(:c(:c(:c(:c:1-[#6](-c:3:c(-[#16]-[#6]-2(-[#1])-[#1]):c(:c(-[#1]):c(:c:3-[#1])-[#1])-[#1])-[#8]-[#6]:[#6])-[#1])-[#1])-[#1])-[#1]","<regId=het_thio_676_B(1)>"
267
- "[#6](-[#1])(-[#1])(-[#1])-c:1:c(:c(:c(:c(:n:1)-[#7](-[#1])-[#16](-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])(=[#8])=[#8])-[#1])-[#1])-[#1]","<regId=sulfonamide_G(1)>"
268
- "[#6](=[#8])(-[#7]-1-[#6]-[#6]-[#16]-[#6]-[#6]-1)-c:2:c(:c(:c(:c(:c:2-[#16]-[#6](-[#1])-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=thio_thiomorph_Z(1)>"
269
- "c:1:c:c:3:c:2:c(:c:1)-[#6](-[#6]=[#6](-c:2:c:c:c:3)-[#8]-[#6](-[#1])-[#1])=[#8]","<regId=naphth_ene_one_A(1)>"
270
- "c:1-3:c:2:c(:c(:c:c:1)-[#7]):c:c:c:c:2-[#6](-[#6]=[#6]-3-[#6](-[F])(-[F])-[F])=[#8]","<regId=naphth_ene_one_B(1)>"
271
- "c:1:c:c:c:c:2:c:1:c:c:3:c(:n:2):n:c:4:c(:c:3-[#7]):c:c:c:c:4","<regId=amino_acridine_A(1)>"
272
- "c:1:c-3:c(:c:c:c:1)-[#6]-2=[#7]-[!#1]=[#6]-[#6]-[#6]-2-[#6]-3=[#8]","<regId=keto_phenone_B(1)>"
273
- "c:1-3:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](=[#7]-[#7](-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#6](=[#8])-[#8]-[#1])-[#1])-[#1])-c:4:c-3:c(:c(:c(:c:4-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=hzone_acid_A(1)>"
274
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#1])-[#1])-[#1])-[#16](=[#8])(=[#8])-[#7](-[#1])-c:2:n:n:c(:c(:c:2-[#1])-[#1])-[#1]","<regId=sulfonamide_H(1)>"
275
- "c2(c(-[#1])n(-[#6](-[#1])-[#1])c:3:c(:c(:c:1n(c(c(c:1:c2:3)-[#1])-[#1])-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1]","<regId=het_565_indole(1)>"
276
- "c1(c-2c(c(n1-[#6](-[#8])=[#8])-[#6](-[#1])-[#1])-[#16]-[#6](-[#1])(-[#1])-[#16]-2)-[#6](-[#1])-[#1]","<regId=pyrrole_J(1)>"
277
- "s1ccnc1-c2c(n(nc2-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=pyrazole_amino_B(1)>"
278
- "c1(c(c(c(n1-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=pyrrole_K(1)>"
279
- "c:1:2(:c(:c(:c(:o:1)-[#6])-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6](-[#1]):[#6](-[#1]):[#6](-[#1]):[#6](-[#1]):[#6]:2-[#6](=[#8])-[#8]-[#1]","<regId=anthranil_acid_I(1)>"
280
- "[!#1]:[#6]-[#6](=[#16])-[#7](-[#1])-[#7](-[#1])-[#6]:[!#1]","<regId=thio_amide_F(1)>"
281
- "[#6]-1(=[#8])-[#6](-[#6](-[#6]#[#7])=[#6](-[#1])-[#7])-[#6](-[#7])-[#6]=[#6]-1","<regId=ene_one_C(1)>"
282
- "c2(c-1n(-[#6](-[#6]=[#6]-[#7]-1)=[#8])nc2-c3cccn3)-[#6]#[#7]","<regId=het_65_H(1)>"
283
- "[#8]=[#6]-1-[#6](=[#7]-[#7]-[#6]-[#6]-1)-[#6]#[#7]","<regId=cyano_imine_D(1)>"
284
- "c:2(:c:1:c:c:c:c:c:1:n:n:c:2)-[#6](-[#6]:[#6])-[#6]#[#7]","<regId=cyano_misc_A(1)>"
285
- "c:1:c:c-2:c(:c:c:1)-[#6]=[#6]-[#6](-[#7]-2-[#6](=[#8])-[#7](-[#1])-c:3:c:c(:c(:c:c:3)-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=ene_misc_C(1)>"
286
- "c:2:c:c:1:n:c(:c(:n:c:1:c:c:2)-c:3:c:c:c:c:c:3)-c:4:c:c:c:c:c:4-[#8]-[#1]","<regId=het_66_E(1)>"
287
- "[#6](-[#1])(-[#1])-[#6](-[#8]-[#1])=[#6](-[#6](=[#8])-[#6](-[#1])-[#1])-[#6](-[#1])-[#6]#[#6]","<regId=keto_keto_beta_F(1)>"
288
- "c:1:c:4:c(:c:c2:c:1nc(n2-[#1])-[#6]-[#8]-[#6](=[#8])-c:3:c:c(:c:c(:c:3)-[#7](-[#1])-[#1])-[#7](-[#1])-[#1]):c:c:c:c:4","<regId=misc_naphthimidazole(1)>"
289
- "c:2(:c:1:c:c:c:c-3:c:1:c(:c:c:2)-[#6]=[#6]-[#6]-3=[#7])-[#7]","<regId=naphth_ene_one_C(1)>"
290
- "c:2(:c:1:c:c:c:c:c:1:c-3:c(:c:2)-[#6](-c:4:c:c:c:c:c-3:4)=[#8])-[#8]-[#1]","<regId=keto_phenone_C(1)>"
291
- "[#6]-,:2(-,:[#6]=,:[#7]-,:c:1:c:c(:c:c:c:1-,:[#8]-,:2)-[Cl])=[#8]","<regId=coumarin_C(1)>"
292
- "[#6]-1=[#6]-[#7](-[#6](-c:2:c-1:c:c:c:c:2)(-[#6]#[#7])-[#6](=[#16])-[#16])-[#6]=[#8]","<regId=thio_est_cyano_A(1)>"
293
- "c2(nc:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])n2-[#6])-[#7](-[#1])-[#6](-[#7](-[#1])-c:3:c(:c:c:c:c:3-[#1])-[#1])=[#8]","<regId=het_65_imidazole(1)>"
294
- "[#7](-[#1])(-[#6]:[#6])-c:1:c(-[#6](=[#8])-[#8]-[#1]):c:c:c(:n:1)-,:[#6]:[#6]","<regId=anthranil_acid_J(1)>"
295
- "c:1-3:c(:c:c:c:c:1)-[#16]-[#6](=[#7]-[#7]=[#6]-2-[#6]=[#6]-[#6]=[#6]-[#6]=[#6]-2)-[#7]-3-[#6](-[#1])-[#1]","<regId=colchicine_het(1)>"
296
- "c:1-2:c(:c(:c(:c(:c:1-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](=[#6](-[#6])-[#16]-[#6]-2(-[#1])-[#1])-[#6]","<regId=ene_misc_D(1)>"
297
- "c:12:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])c(c(-[#6]:[#6])n2-!@[#6]:[#6])-[#6](-[#1])-[#1]","<regId=indole_3yl_alk_B(1)>"
298
- "[#7](-[#1])(-[#1])-c:1:c:c:c(:c:c:1-[#8]-[#1])-[#16](=[#8])(=[#8])-[#8]-[#1]","<regId=anil_OH_no_alk_A(1)>"
299
- "s:1:c:c:c(:c:1-[#1])-c:2:c:s:c(:n:2)-[#7](-[#1])-[#1]","<regId=thiazole_amine_L(1)>"
300
- "c1c(-[#7](-[#1])-[#1])nnc1-c2c(-[#6](-[#1])-[#1])oc(c2-[#1])-[#1]","<regId=pyrazole_amino_A(1)>"
301
- "n1nscc1-c2nc(no2)-[#6]:[#6]","<regId=het_thio_N_5D(1)>"
302
- "c:1(:c:c-3:c(:c:c:1)-[#7]-[#6]-4-c:2:c:c:c:c:c:2-[#6]-[#6]-3-4)-[#6;X4]","<regId=anil_alk_indane(1)>"
303
- "c:1-2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#6](=[#6](-[#1])-[#6]-3-[#6](-[#6]#[#7])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#7]-2-3)-[#1]","<regId=anil_di_alk_N(1)>"
304
- "c:2-,:3:c(:c:c:1:c:c:c:c:c:1:c:2)-,:[#7](-[#6](-[#1])-[#1])-,:[#6](=[#8])-,:[#6](=,:[#7]-,:3)-[#6]:[#6]-[#7](-[#1])-[#6](-[#1])-[#1]","<regId=het_666_C(1)>"
305
- "[#6](-[#8]-[#1]):[#6]-[#6](=[#8])-[#6](-[#1])=[#6](-[#6])-[#6]","<regId=ene_one_D(1)>"
306
- "c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1]):c(:c(-[#1]):n:2-[#1])-[#16](=[#8])=[#8]","<regId=anil_di_alk_indol(1)>"
307
- "c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#1])-[#1]):c(:c(-[#1]):n:2-[#6](-[#1])-[#1])-[#1]","<regId=anil_no_alk_indol_A(1)>"
308
- "[#16;X2]-1-[#6]=[#6](-[#6]#[#7])-[#6](-[#6])(-[#6]=[#8])-[#6](=[#6]-1-[#7](-[#1])-[#1])-[$([#6]=[#8]),$([#6]#[#7])]","<regId=dhp_amino_CN_G(1)>"
309
- "[#7]-2-[#6]=[#6](-[#6]=[#8])-[#6](-c:1:c:c:c(:c:c:1)-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#6]~3=,:[#6]-2~[#7]~[#6](~[#16])~[#7]~[#6]~3~[#7]","<regId=anil_di_alk_dhp(1)>"
310
- "c:1:c(:c:c:c:c:1)-[#6](=[#8])-[#7](-[#1])-c:2:c(:c:c:c:c:2)-[#6](=[#8])-[#7](-[#1])-[#7](-[#1])-c:3:n:c:c:s:3","<regId=anthranil_amide_A(1)>"
311
- "c:1:c:2:c(:c:c:c:1):c(:c:3:c(:c:2):c:c:c:c:3)-[#6]=[#7]-[#7](-[#1])-c:4:c:c:c:c:c:4","<regId=hzone_anthran_Z(1)>"
312
- "c:1:c(:c:c:c:c:1)-[#6](-[#1])-[#7]-[#6](=[#8])-[#6](-[#7](-[#1])-[#6](-[#1])-[#1])=[#6](-[#1])-[#6](=[#8])-c:2:c:c:c(:c:c:2)-[#8]-[#6](-[#1])-[#1]","<regId=ene_one_amide_A(1)>"
313
- "s:1:c(:c(-[#1]):c(:c:1-[#6]-3=[#7]-c:2:c:c:c:c:c:2-[#6](=[#7]-[#7]-3-[#1])-c:4:c:c:n:c:c:4)-[#1])-[#1]","<regId=het_76_A(1)>"
314
- "o:1:c(:c(-[#1]):c(:c:1-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](=[#16])-[#7](-[#6]-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c:c:c:2)-[#1])-[#1]","<regId=thio_urea_N(1)>"
315
- "c:1:c(:c:c:c:c:1)-[#7](-[#6]-[#1])-[#6](-[#1])-[#6](-[#1])-[#6](-[#1])-[#7](-[#1])-[#6](=[#8])-[#6]-,:2=,:[#6](-,:[#8]-,:[#6](-,:[#6](=,:[#6]-,:2-[#6](-[#1])-[#1])-[#1])=[#8])-[#6](-[#1])-[#1]","<regId=anil_di_alk_coum(1)>"
316
- "c2-3:c:c:c:1:c:c:c:c:c:1:c2-[#6](-[#1])-[#6;X4]-[#7]-[#6]-3=[#6](-[#1])-[#6](=[#8])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=ene_one_amide_B(1)>"
317
- "c:1:c(:c:c:c:c:1)-[#6]-4=[#7]-[#7]:2:[#6](:[#7+]:c:3:c:2:c:c:c:c:3)-[#16]-[#6;X4]-4","<regId=het_thio_656c(1)>"
318
- "[#6]-2(=[#8])-[#6](=[#6](-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#7]=[#6](-c:1:c:c:c:c:c:1)-[#8]-2","<regId=het_5_ene(1)>"
319
- "c:1:c(:c:c:c:c:1)-[#7]-2-[#6](=[#8])-[#6](=[#6](-[#1])-[#6]-2=[#8])-[#16]-c:3:c:c:c:c:c:3","<regId=thio_imide_A(1)>"
320
- "[#7]-,:1(-[#1])-,:[#7]=,:[#6](-[#7]-[#1])-,:[#16]-,:[#6](=,:[#6]-,:1-,:[#6]:[#6])-,:[#6]:[#6]","<regId=dhp_amidine_A(1)>"
321
- "c:1(:c(:c-3:c(:c(:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])-c:2:c(:c(:c(:o:2)-[#6]-[#1])-[#1])-[#1])-[#1])-[#8]-[#6](-[#8]-3)(-[#1])-[#1])-[#1])-[#1]","<regId=thio_urea_O(1)>"
322
- "c:1(:c(:c(:c(:c(:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-c:2:c:c:c:c:c:2)-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_O(1)>"
323
- "[#8]=[#6]-!@n:1:c:c:c-,:2:c:1-,:[#7](-[#1])-,:[#6](=[#16])-,:[#7]-,:2-[#1]","<regId=thio_urea_P(1)>"
324
- "[#6](-[F])(-[F])-[#6](=[#8])-[#7](-[#1])-c:1:c(-[#1]):n(-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#8]-[#6](-[#1])(-[#1])-[#6]:[#6]):n:c:1-[#1]","<regId=het_pyraz_misc(1)>"
325
- "[#7]-2=[#7]-[#6]:1:[#7]:[!#6&!#1]:[#7]:[#6]:1-[#7]=[#7]-[#6]:[#6]-2","<regId=diazox_C(1)>"
326
- "[#6]-2(-[#1])(-[#8]-[#1])-[#6]:1:[#7]:[!#6&!#1]:[#7]:[#6]:1-[#6](-[#1])(-[#8]-[#1])-[#6]=[#6]-2","<regId=diazox_D(1)>"
327
- "[#6]-1(-[#6](-[#1])(-[#1])-[#6]-1(-[#1])-[#1])(-[#6](=[#8])-[#7](-[#1])-c:2:c:c:c(:c:c:2)-[#8]-[#6](-[#1])(-[#1])-[#8])-[#16](=[#8])(=[#8])-[#6]:[#6]","<regId=misc_cyclopropane(1)>"
328
- "[#6]-1:[#6]-[#6](=[#8])-[#6]=[#6]-1-[#7]=[#6](-[#1])-[#7](-[#6;X4])-[#6;X4]","<regId=imine_ene_one_B(1)>"
329
- "c:1:c:c(:c:c-,:2:c:1-,:[#6](=,:[#6](-[#1])-,:[#6](=[#8])-,:[#8]-,:2)-c:3:c:c:c:c:c:3)-[#8]-[#6](-[#1])(-[#1])-[#6]:[#8]:[#6]","<regId=coumarin_D(1)>"
330
- "c:1:c(:o:c(:c:1-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#7]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#8]-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#8]-c:2:c:c-3:c(:c:c:2)-[#8]-[#6](-[#8]-3)(-[#1])-[#1]","<regId=misc_furan_A(1)>"
331
- "[#7]-4(-c:1:c:c:c:c:c:1)-[#6](=[#8])-[#16]-[#6](-[#1])(-[#7](-[#1])-c:2:c:c:c:c:3:c:c:c:c:c:2:3)-[#6]-4=[#8]","<regId=rhod_sat_E(1)>"
332
- "[#7]-3(-[#6](=[#8])-c:1:c:c:c:c:c:1)-[#6](=[#7]-c:2:c:c:c:c:c:2)-[#16]-[#6](-[#1])(-[#1])-[#6]-3=[#8]","<regId=rhod_sat_imine_A(1)>"
333
- "[#7]-2(-c:1:c:c:c:c:c:1)-[#6](=[#8])-[#16]-[#6](-[#1])(-[#1])-[#6]-2=[#16]","<regId=rhod_sat_F(1)>"
334
- "[#7]-1(-[#6](-[#1])-[#1])-[#6](=[#16])-[#7](-[#6]:[#6])-[#6](=[#7]-[#6]:[#6])-[#6]-1=[#7]-[#6]:[#6]","<regId=het_thio_5_imine_B(1)>"
335
- "[#16]-1-[#6](=[#7]-[#7]-[#1])-[#16]-[#6](=[#7]-[#6]:[#6])-[#6]-1=[#7]-[#6]:[#6]","<regId=het_thio_5_imine_C(1)>"
336
- "[#6]-2(=[#8])-[#6](=[#6](-[#1])-c:1:c(:c:c:c(:c:1)-[F,Cl,Br,I])-[#8]-[#6](-[#1])-[#1])-[#7]=[#6](-[#16]-[#6](-[#1])-[#1])-[#16]-2","<regId=ene_five_het_N(1)>"
337
- "[#6](-[#1])(-[#1])-[#16]-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=thio_carbam_A(1)>"
338
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6])-[#1])-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=misc_anilide_A(1)>"
339
- "c:1(:c(:c(:c(:c(:c:1-[#6](-[#1])-[#1])-[#1])-[Br])-[#1])-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=misc_anilide_B(1)>"
340
- "c:1-2:c(:c:c:c(:c:1-[#8]-[#6](-[#1])(-[#1])-[#7](-[#6]:[#6]-[#8]-[#6](-[#1])-[#1])-[#6]-2(-[#1])-[#1])-[#1])-[#1]","<regId=mannich_B(1)>"
341
- "c:1-2:c(:c(:c(:c(:c:1-[#8]-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6]-2(-[#1])-[#1])-[#1])-[#8])-[#8])-[#1]","<regId=mannich_catechol_A(1)>"
342
- "[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=anil_alk_D(1)>"
343
- "n:1:2:c:c:c(:c:c:1:c:c(:c:2-[#6](=[#8])-[#6]:[#6])-[#6]:[#6])-[#6](~[#8])~[#8]","<regId=het_65_I(1)>"
344
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#6](=[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#6](-[#6;X4])(-[#6;X4])-[#7](-[#1])-[#6](=[#8])-[#7](-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=misc_urea_A(1)>"
345
- "[#6]-3(-[#1])(-n:1:c(:n:c(:c:1-[#1])-[#1])-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[Br])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-c:4:c-3:c(:c(:c(:c:4-[#1])-[#1])-[#1])-[#1]","<regId=imidazole_C(1)>"
346
- "[#6](=[#6](-[#1])-[#6](-[#1])(-[#1])-n:1:c(:n:c(:c:1-[#1])-[#1])-[#1])(-[#6]:[#6])-[#6]:[#6]","<regId=styrene_imidazole_A(1)>"
347
- "c:1(:n:c(:c(-[#1]):s:1)-c:2:c:c:n:c:c:2)-[#7](-[#1])-[#6]:[#6]-[#6](-[#1])-[#1]","<regId=thiazole_amine_M(1)>"
348
- "c:1(:n:c(:c(-[#1]):s:1)-c:2:c:c:c:c:c:2)-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]-[#6](-[#1])(-[#1])-c:3:c:c:c:n:3-[#1]","<regId=misc_pyrrole_thiaz(1)>"
349
- "n:1(-[#1]):c(:c(-[#6](-[#1])-[#1]):c(:c:1-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#6](=[#8])-[#8]-[#6](-[#1])-[#1]","<regId=pyrrole_L(1)>"
350
- "c:2(:n:c:1:c(:c(:c:c(:c:1-[#1])-[F,Cl,Br,I])-[#1]):n:2-[#1])-[#16]-[#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6]","<regId=het_thio_65_D(1)>"
351
- "c:1(:c(:c-2:c(:c(:c:1-[#8]-[#6](-[#1])-[#1])-[#1])-[#6]=[#6]-[#6](-[#1])-[#16]-2)-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=ene_misc_E(1)>"
352
- "[#7]-1(-[#1])-[#6](=[#16])-[#6](-[#1])(-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6]-1-[#6]:[#6])-[#1]","<regId=thio_cyano_A(1)>"
353
- "n:1:c(:c(:c(:c(:c:1-[#16;X2]-c:2:c:c:c:c:c:2-[#7](-[#1])-[#1])-[#6]#[#7])-c:3:c:c:c:c:c:3)-[#6]#[#7])-[#7](-[#1])-[#1]","<regId=cyano_amino_het_B(1)>"
354
- "[#7]-,:2(-c:1:c:c:c(:c:c:1)-[#8]-[#6](-[#1])-[#1])-,:[#6](=[#8])-,:[#6](=,:[#6]-,:[#6](=,:[#7]-,:2)-n:3:c:n:c:c:3)-[#6]#[#7]","<regId=cyano_pyridone_G(1)>"
355
- "o:1:c(:c:c:2:c:1:c(:c(:c(:c:2-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](~[#8])~[#8]","<regId=het_65_J(1)>"
356
- "[#6]#[#6]-[#6](=[#8])-[#6]#[#6]","<regId=ene_one_yne_A(1)>"
357
- "c:2(:c:1:c(:c(:c(:c(:c:1:c(:c(:c:2-[#8]-[#1])-[#6]=[#8])-[#1])-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=anil_OH_no_alk_B(1)>"
358
- "c:1(:c(:c(:[c;!H0,$(c-[#6;!H0;!H1])](:o:1))-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6;!H0,$([#6]-[#6;!H0!H1])]-c:2:c:c:c:c(:c:2)-[*]-[*]-[*]-c:3:c:c:c:o:3","<regId=hzone_acyl_misc_A(1)>"
359
- "[#16](=[#8])(=[#8])-[#7](-[#1])-c:1:c(:c(:c(:s:1)-[#6]-[#1])-[#6]-[#1])-[#6](=[#8])-[#7]-[#1]","<regId=thiophene_F(1)>"
360
- "[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#8]-[#1])-[#6](-[#1])-[#1]","<regId=anil_OC_alk_E(1)>"
361
- "[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#6]=[#8])-[#16]","<regId=anil_OC_alk_F(1)>"
362
- "n1nnnc2cccc12","<regId=het_65_K(1)>"
363
- "c:1-,:2:c(-[#1]):s:c(:c:1-,:[#6](=[#8])-,:[#7]-,:[#7]=,:[#6]-,:2-[#7](-[#1])-[#1])-[#6]=[#8]","<regId=het_65_L(1)>"
364
- "c:1-,:3:c(:c:2:c(:c:c:1-[Br]):o:c:c:2)-,:[#6](=,:[#6]-,:[#6](=[#8])-,:[#8]-,:3)-[#1]","<regId=coumarin_E(1)>"
365
- "c:1-,:3:c(:c:c:c:c:1)-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-c:2:n:o:c:c:2-[Br])-,:[#6](=[#8])-,:[#8]-,:3)-[#1]","<regId=coumarin_F(1)>"
366
- "c:1-,:2:c(:c:c(:c:c:1-[F,Cl,Br,I])-[F,Cl,Br,I])-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-[#1])-,:[#6](=[#7]-[#1])-,:[#8]-,:2)-[#1]","<regId=coumarin_G(1)>"
367
- "c:1-,:3:c(:c:c:c:c:1)-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-c:2:n:c(:c:s:2)-[#6]:[#16]:[#6]-[#1])-,:[#6](=[#8])-,:[#8]-,:3)-[#1]","<regId=coumarin_H(1)>"
368
- "[#6](-[#1])(-[#1])-[#16;X2]-c:2:n:n:c:1-[#6]:[#6]-[#7]=[#6]-[#8]-c:1:n:2","<regId=het_thio_67_A(1)>"
369
- "[#16](=[#8])(=[#8])(-c:1:c:n(-[#6](-[#1])-[#1]):c:n:1)-[#7](-[#1])-c:2:c:n(:n:c:2)-[#6](-[#1])(-[#1])-[#6]:[#6]-[#8]-[#6](-[#1])-[#1]","<regId=sulfonamide_I(1)>"
370
- "c:1-2:c(:c(:c(:c(:c:1-[#8]-[#6](-[#1])(-[#1])-[#8]-2)-[#6](-[#1])(-[#1])-[#7]-3-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]-3)-[#1])-[#1])-[#1]","<regId=het_65_mannich(1)>"
371
- "[#6](-[#1])(-[#1])-[#8]-[#6]:[#6]-[#6](-[#1])(-[#1])-[#7](-[#1])-c:2:c(:c(:c:1:n(:c(:n:c:1:c:2-[#1])-[#1])-[#6]-[#1])-[#1])-[#1]","<regId=anil_alk_A(1)>"
372
- "[#7]-4(-c:1:c:c:c:c:c:1)-[#6](=[#7+](-c:2:c:c:c:c:c:2)-[#6](=[#7]-c:3:c:c:c:c:c:3)-[#7]-4)-[#1]","<regId=het_5_inium(1)>"
373
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c:1:s:c(:n:c:1:c:2)-[#16]-[#6](-[#1])-[#1]","<regId=anil_di_alk_P(1)>"
374
- "c:1:2:c(:c(:c(:c(:c:1:c(:c(-[#1]):c(:c:2-[#1])-[#1])-[#6](-[#6](-[#1])-[#1])=[#7]-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6]:[#6]:[#6])-[#1])-[#1])-[#1])-[#1]","<regId=thio_urea_Q(1)>"
375
- "[#6]:1(:[#7]:[#6](:[#7]:[!#1]:[#7]:1)-c:2:c(:c(:c(:o:2)-[#1])-[#1])-[#1])-[#16]-[#6;X4]","<regId=thio_pyridine_A(1)>"
376
- "n:1:c(:n:c(:n:c:1-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#6]-[#1])-[#6]=[#8]","<regId=melamine_B(1)>"
377
- "c:1(:n:s:c(:n:1)-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]-[#6](=[#8])-c:2:c:c:c:c:c:2-[#6](=[#8])-[#8]-[#1])-c:3:c:c:c:c:c:3","<regId=misc_phthal_thio_N(1)>"
378
- "n:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6](-[#1])-c:2:c:c:c:c:c:2-[#8]-[#6](-[#1])(-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=hzone_acyl_misc_B(1)>"
379
- "[#6](-[#1])(-[#1])(-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-c:1:c(:c(:c(:c(:c:1-[#8]-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-[#6](-[#1])(-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c(:c(:c:2-[#1])-[#1])-[#8]-[#1])-[#1]","<regId=tert_butyl_B(1)>"
380
- "[#7](-[#1])(-[#1])-c:1:c(-[#7](-[#1])-[#1]):c(:c(-[#1]):c:2:n:o:n:c:1:2)-[#1]","<regId=diazox_E(1)>"
381
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#7](-[#1])-[#16](=[#8])=[#8])-[#1])-[#7](-[#1])-[#6](-[#1])-[#1])-[F,Cl,Br,I])-[#1]","<regId=anil_NH_no_alk_B(1)>"
382
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#7]=[#6]-2-[#6](=[#6]~[#6]~[#6]=[#6]-2)-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_no_alk_A(1)>"
383
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-n:2:c:c:c:c:2)-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1]","<regId=anil_no_alk_B(1)>"
384
- "[#16]=[#6]-[#6](-[#6](-[#1])-[#1])=[#6](-[#6](-[#1])-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=thio_ene_amine_A(1)>"
385
- "[#6]-1:[#6]-[#8]-[#6]-2-[#6](-[#1])(-[#1])-[#6](=[#8])-[#8]-[#6]-1-2","<regId=het_55_B(1)>"
386
- "[#8]-[#6](=[#8])-[#6](-[#1])(-[#1])-[#16;X2]-[#6](=[#7]-[#6]#[#7])-[#7](-[#1])-c:1:c:c:c:c:c:1","<regId=cyanamide_A(1)>"
387
- "[#8]=[#6]-[#6]-1=[#6](-[#16]-[#6](=[#6](-[#1])-[#6])-[#16]-1)-[#6]=[#8]","<regId=ene_one_one_A(1)>"
388
- "[#8]=[#6]-1-[#7]-[#7]-[#6](=[#7]-[#6]-1=[#6]-[#1])-[!#1]:[!#1]","<regId=ene_six_het_D(1)>"
389
- "[#8]=[#6]-[#6](-[#1])=[#6](-[#6]#[#7])-[#6]","<regId=ene_cyano_E(1)>"
390
- "[#8](-[#1])-[#6](=[#8])-c:1:c(:c(:c(:c(:c:1-[#8]-[#1])-[#1])-c:2:c(-[#1]):c(:c(:o:2)-[#6](-[#1])=[#6](-[#6]#[#7])-c:3:n:c:c:n:3)-[#1])-[#1])-[#1]","<regId=ene_cyano_F(1)>"
391
- "c:1:c(:c:c:c:c:1)-[#7](-c:2:c:c:c:c:c:2)-[#7]=[#6](-[#1])-[#6]:3:[#6](:[#6](:[#6](:[!#1]:3)-c:4:c:c:c:c(:c:4)-[#6](=[#8])-[#8]-[#1])-[#1])-[#1]","<regId=hzone_furan_C(1)>"
392
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-c:2:c(-[#1]):c(:c(-[#6](-[#1])-[#1]):o:2)-[#6]=[#8])-[#1])-[#1]","<regId=anil_no_alk_C(1)>"
393
- "[#8](-[#1])-[#6](=[#8])-c:1:c:c:c(:c:c:1)-[#7]-[#7]=[#6](-[#1])-[#6]:2:[#6](:[#6](:[#6](:[!#1]:2)-c:3:c:c:c:c:c:3)-[#1])-[#1]","<regId=hzone_acid_D(1)>"
394
- "[#8](-[#1])-[#6](=[#8])-c:1:c:c:c:c(:c:1)-[#6]:[!#1]:[#6]-[#6]=[#7]-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#8]","<regId=hzone_furan_E(1)>"
395
- "[#8](-[#1])-[#6]:1:[#6](:[#6]:[!#1]:[#6](:[#7]:1)-[#7](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](=[#8])-[#8]","<regId=het_6_pyridone_NH2(1)>"
396
- "[#6]-1(=[!#6&!#1])-[#6](-[#7]=[#6]-[#16]-1)=[#8]","<regId=imine_one_fives_D(1)>"
397
- "n2(-c:1:c:c:c:c:c:1)c(c(-[#1])c(c2-[#6]=[#7]-[#8]-[#1])-[#1])-[#1]","<regId=pyrrole_M(1)>"
398
- "n2(-[#6](-[#1])-c:1:c(:c(:c:c(:c:1-[#1])-[#1])-[#1])-[#1])c(c(-[#1])c(c2-[#6]-[#1])-[#1])-[#6]-[#1]","<regId=pyrrole_N(1)>"
399
- "n1(-[#6](-[#1])-[#1])c(c(-[#6](=[#8])-[#6])c(c1-[#6]:[#6])-[#6])-[#6](-[#1])-[#1]","<regId=pyrrole_O(1)>"
400
- "n1(-[#6])c(c(-[#1])c(c1-[#6](-[#1])=[#6](-[#6]#[#7])-c:2:n:c:c:s:2)-[#1])-[#1]","<regId=ene_cyano_G(1)>"
401
- "n3(-c:1:c:c:c:c:c:1-[#7](-[#1])-[#16](=[#8])(=[#8])-c:2:c:c:c:s:2)c(c(-[#1])c(c3-[#1])-[#1])-[#1]","<regId=sulfonamide_J(1)>"
402
- "n2(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#8]-[#6]:[#6])c(c(-[#1])c(c2-[#1])-[#1])-[#1]","<regId=misc_pyrrole_benz(1)>"
403
- "c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-[#6](-[#1])=[#6](-[#1])-[#6]=[#8]","<regId=thio_urea_R(1)>"
404
- "[#6]-1(-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6]-[#6](-[#1])(-[#1])-[#6]-1=[#8])=[#6](-[#7]-[#1])-[#6]=[#8]","<regId=ene_one_one_B(1)>"
405
- "[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#16]-[#6;X4]-[#16]-1","<regId=dhp_amino_CN_H(1)>"
406
- "[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-c:2:c:c:n:c:3:c(:c:c:c(:c:2:3)-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=het_66_anisole(1)>"
407
- "[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#7](-[#1])-c:2:n:c(:c:s:2)-c:3:c:c:c(:c:c:3)-[#8]-[#6](-[#1])-[#1]","<regId=thiazole_amine_N(1)>"
408
- "[#6]~1~3~[#7](-[#6]:[#6])~[#6]~[#6]~[#6]~[#6]~1~[#6]~2~[#7]~[#6]~[#6]~[#6]~[#7+]~2~[#7]~3","<regId=het_pyridiniums_C(1)>"
409
- "[#7]-3(-c:2:c:1:c:c:c:c:c:1:c:c:c:2)-[#7]=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6]-3=[#8]","<regId=het_5_E(1)>"
410
- "[#6]-1(=[#6;!H0,$([#6]-[#6;!H0;!H1]),$([#6]-[#6]=[#8])]-[#16]-[#6](-[#7;!H0,$([#7]-[#6;!H0]),$([#7]-[#6]:[#6])]-1)=[#7;!R])-[$([#6](-[#1])-[#1]),$([#6]:[#6])]","<regId=thiaz_ene_A(128)>"
411
- "n2(-[#6]:1:[!#1]:[#6]:[#6]:[#6]:[#6]:1)c(cc(c2-[#6;X4])-[#1])-[#6;X4]","<regId=pyrrole_A(118)>"
412
- "c:1:c:c(:c(:c:c:1)-[#8]-[#1])-[#8]-[#1]","<regId=catechol_A(92)>"
413
- "[#6]-1(=[#6])-[#6](-[#7]=[#6]-[#16]-1)=[#8]","<regId=ene_five_het_B(90)>"
414
- "[#6]-1=[!#1]-[!#6&!#1]-[#6](-[#6]-1=[!#6&!#1;!R])=[#8]","<regId=imine_one_fives(89)>"
415
- "[#6]-1(-[#6](-[#6]=[#6]-[!#6&!#1]-1)=[#6])=[!#6&!#1]","<regId=ene_five_het_C(85)>"
416
- "[#6]-[#7]-1-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])(-[#1])-[#6]-1(-[#1])-[#1])-[#7]=[#6](-[#1])-[#6]:[!#1]","<regId=hzone_pipzn(79)>"
417
- "c:1-2:c(:c:c:c:c:1)-[#6](=[#8])-[#6;X4]-[#6]-2=[#8]","<regId=keto_keto_beta_A(68)>"
418
- "n1(-[#6])c(c(-[#1])c(c1-[#6]=[#7]-[#7])-[#1])-[#1]","<regId=hzone_pyrrol(64)>"
419
- "[#6]=!@[#6](-[!#1])-@[#6](=!@[!#6&!#1])-@[#6](=!@[#6])-[!#1]","<regId=ene_one_ene_A(57)>"
420
- "[#6](-[#6]#[#7])(-[#6]#[#7])-[#6](-[#7](-[#1])-[#1])=[#6]-[#6]#[#7]","<regId=cyano_ene_amine_A(56)>"
421
- "c:1-2:c(:c:c:c:c:1)-[#6](=[#8])-[#6](=[#6])-[#6]-2=[#8]","<regId=ene_five_one_A(55)>"
422
- "[#6]-,:1(=,:[!#1]-,:[!#1]=,:[!#1]-,:[#7](-,:[#6]-,:1=[#16])-[#1])-[#6]#[#7]","<regId=cyano_pyridone_A(54)>"
423
- "c:1:c:c-2:c(:c:c:1)-[#6]-3-[#6](-[#6]-[#7]-2)-[#6]-[#6]=[#6]-3","<regId=anil_alk_ene(51)>"
424
- "c:1:c:2:c(:c:c:c:1):n:c:3:c(:c:2-[#7]):c:c:c:c:3","<regId=amino_acridine_A(46)>"
425
- "[#6]-1(=[#6])-[#6](=[#8])-[#7]-[#7]-[#6]-1=[#8]","<regId=ene_five_het_D(46)>"
426
- "[#7](-[#1])(-[#1])-c:1:c(:c(:c(:s:1)-[!#1])-[!#1])-[#6]=[#8]","<regId=thiophene_amino_Aa(45)>"
427
- "[#7]-[#6]=!@[#6]-2-[#6](=[#8])-c:1:c:c:c:c:c:1-[!#6&!#1]-2","<regId=ene_five_het_E(44)>"
428
- "c:1(:c(:c(:c(:c(:c:1-[#8]-[#1])-[F,Cl,Br,I])-[#1])-[F,Cl,Br,I])-[#1])-[#16](=[#8])(=[#8])-[#7]","<regId=sulfonamide_A(43)>"
429
- "[#6]-[#6](=[#16])-[#6]","<regId=thio_ketone(43)>"
430
- "c:1:c:c(:c:c:c:1-[#8]-[#1])-[#7](-[#1])-[#16](=[#8])=[#8]","<regId=sulfonamide_B(41)>"
431
- "c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[$([#8]),$([#7]),$([#6](-[#1])-[#1])])-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=anil_no_alk(40)>"
432
- "[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#6]:[#6])]:1:c(:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-[#6])-[#6](=[#8])-[#8])-[$([#6]:1:[#6]:[#6]:[#6]:[#6]:[#6]:1),$([#6]:1:[#16]:[#6]:[#6]:[#6]:1)]","<regId=thiophene_amino_Ab(40)>"
433
- "[#7+]:1(:[#6]:[#6]:[!#1]:c:2:c:1:c(:[c;!H0,$(c-[#7])]:c:c:2)-[#1])-[$([#6](-[#1])(-[#1])-[#1]),$([#8;X1]),$([#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1]),$([#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#8]-[#1]),$([#6](-[#1])(-[#1])-[#6](=[#8])-[#6]),$([#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6]),$([#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#1])]","<regId=het_pyridiniums_A(39)>"
434
- "c:1:c:c:c:c(:c:1-[#7&!H0;!H1,!$([#7]-[#6]=[#8])])-[#6](-[#6]:[#6])=[#8]","<regId=anthranil_one_A(38)>"
435
- "[#7](-[#1])-[#7]=[#6](-[#6]#[#7])-[#6]=[!#6&!#1;!R]","<regId=cyano_imine_A(37)>"
436
- "[#7](-c:1:c:c:c:c:c:1)-[#16](=[#8])(=[#8])-[#6]:2:[#6]:[#6]:[#6]:[#6]:3:[#7]:[$([#8]),$([#16])]:[#7]:[#6]:2:3","<regId=diazox_sulfon_A(36)>"
437
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])=[#7]-[#7]-[$([#6](=[#8])-[#6](-[#1])(-[#1])-[#16]-[#6]:[#7]),$([#6](=[#8])-[#6](-[#1])(-[#1])-[!#1]:[!#1]:[#7]),$([#6](=[#8])-[#6]:[#6]-[#8]-[#1]),$([#6]:[#7]),$([#6](-[#1])(-[#1])-[#6](-[#1])-[#8]-[#1])])-[#1])-[#1]","<regId=hzone_anil_di_alk(35)>"
438
- "[#7]-1-[#6](=[#16])-[#16]-[#6;X4]-[#6]-1=[#8]","<regId=rhod_sat_A(33)>"
439
- "[#7](-[#1])-[#7]=[#6]-[#6;!H0,$([#6]-[#6])]=[#6](-[#6])-!@[$([#7]),$([#8]-[#1])]","<regId=hzone_enamin(30)>"
440
- "n2(-[#6]:1:[!#1]:[#6]:[#6]:[#6]:[#6]:1)c(cc(c2-[#6]:[#6])-[#1])-[#6;X4]","<regId=pyrrole_B(29)>"
441
- "s1ccc(c1)-[#8]-[#1]","<regId=thiophene_hydroxy(28)>"
442
- "[#6]-,:1(=,:[#6](-,:[#6](=[#8])-,:[#7]-,:[#6](=,:[#7]-,:1)-,:[!#6&!#1])-[#6]#[#7])-[#6]","<regId=cyano_pyridone_B(27)>"
443
- "[#6]-1(-[#6](=[#8])-[#7]-[#6](=[#8])-[#7]-[#6]-1=[#8])=[#7]","<regId=imine_one_sixes(27)>"
444
- "[#6](-[#1])(-[#1])-[#7]([#6]:[#6])~[#6][#6]=,:[#6]-[#6]~[#6][#7]","<regId=dyes5A(27)>"
445
- "c:2:c:1:c:c:c:c-,:3:c:1:c(:c:c:2)-,:[#7]-,:[#6]=,:[#7]-,:3","<regId=naphth_amino_A(25)>"
446
- "c:2:c:1:c:c:c:c-3:c:1:c(:c:c:2)-[#7](-[#6;X4]-[#7]-3-[#1])-[#1]","<regId=naphth_amino_B(25)>"
447
- "[#6]-[#6](=[#8])-[#6](-[#1])=[#6](-[#7](-[#1])-[#6])-[#6](=[#8])-[#8]-[#6]","<regId=ene_one_ester(24)>"
448
- "[#16]=[#6]-1-[#6]=,:[#6]-[!#6&!#1]-[#6]=,:[#6]-1","<regId=thio_dibenzo(23)>"
449
- "[#6](-[#6]#[#7])(-[#6]#[#7])-[#6](-[$([#6]#[#7]),$([#6]=[#7])])-[#6]#[#7]","<regId=cyano_cyano_A(23)>"
450
- "c:1:2:c(:c(:c(:c(:c:1:c(:c(:c(:c:2-[#1])-[#8]-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6])-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=hzone_acyl_naphthol(22)>"
451
- "[#8]=[#6]-c2c1nc(-[#6](-[#1])-[#1])cc(-[#8]-[#1])n1nc2","<regId=het_65_A(21)>"
452
- "n:1:c(:n(:c(:c:1-c:2:c:c:c:c:c:2)-c:3:c:c:c:c:c:3)-[#1])-[#6]:[!#1]","<regId=imidazole_A(19)>"
453
- "[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1","<regId=ene_cyano_A(19)>"
454
- "c:1(:c:c:c:c:c:1-[#7](-[#1])-[#7]=[#6])-[#6](=[#8])-[#8]-[#1]","<regId=anthranil_acid_A(19)>"
455
- "[#7+]([#6]:[#6])=,:[#6]-[#6](-[#1])=[#6]-[#7](-[#6;X4])-[#6]","<regId=dyes3A(19)>"
456
- "[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6](-[#7](-[#1])-[#1])-[#16]-1)-[#6]#[#7]","<regId=dhp_bis_amino_CN(19)>"
457
- "[#7]~[#6]:1:[#7]:[#7]:[#6](:[$([#7]),$([#6]-[#1]),$([#6]-[#7]-[#1])]:[$([#7]),$([#6]-[#7])]:1)-[$([#7]-[#1]),$([#8]-[#6](-[#1])-[#1])]","<regId=het_6_tetrazine(18)>"
458
- "[#6]-[#6]=[#6](-[F,Cl,Br,I])-[#6](=[#8])-[#6]","<regId=ene_one_hal(17)>"
459
- "[#6](-[#6]#[#7])(-[#6]#[#7])=[#7]-[#7](-[#1])-c:1:c:c:c:c:c:1","<regId=cyano_imine_B(17)>"
460
- "[#6]-,:1(=,:[#6](-!@[#6](=[#8])-[#7]-[#6](-[#1])-[#1])-,:[#16]-,:[#6](-,:[#7]-,:1-,:[$([#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1]),$([#6]:[#6])])=[#16])-,:[$([#7]-[#6](=[#8])-[#6]:[#6]),$([#7](-[#1])-[#1])]","<regId=thiaz_ene_B(17)>"
461
- "[#16]-1-[#6](=[#8])-[#7]-[#6](=[#8])-[#6]-1=[#6](-[#1])-[$([#6]-[#35]),$([#6]:[#6](-[#1]):[#6](-[F,Cl,Br,I]):[#6]:[#6]-[F,Cl,Br,I]),$([#6]:[#6](-[#1]):[#6](-[#1]):[#6]-[#16]-[#6](-[#1])-[#1]),$([#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-[#8]-[#6](-[#1])-[#1]),$([#6]:1:[#6](-[#6](-[#1])-[#1]):[#7](-[#6](-[#1])-[#1]):[#6](-[#6](-[#1])-[#1]):[#6]:1)]","<regId=ene_rhod_B(16)>"
462
- "[#8]-,:1-,:[#6](-,:[#16]-,:c:2:c-,:1:c:c:c(:c:2)-,:[$([#7]),$([#8])])=[$([#8]),$([#16])]","<regId=thio_carbonate_A(15)>"
463
- "[#7](-[#6](-[#1])-[#1])(-[#6](-[#1])-[#1])-c:1:c(:c(:c(:o:1)-[#6]=[#7]-[#7](-[#1])-[#6]=[!#6&!#1])-[#1])-[#1]","<regId=anil_di_alk_furan_A(15)>"
464
- "c:1(:c:c:c:c:c:1)-[#6](-[#1])=!@[#6]-3-[#6](=[#8])-c:2:c:c:c:c:c:2-[#16]-3","<regId=ene_five_het_F(15)>"
465
- "[#6]-1(-[#6](~[!#6&!#1]~[#6]-[!#6&!#1]-[#6]-1=[!#6&!#1])~[!#6&!#1])=[#6;!R]-[#1]","<regId=ene_six_het_A(483)>"
466
- "c:1:c:c(:c(:c:c:1)-[#6]=[#7]-[#7])-[#8]-[#1]","<regId=hzone_phenol_A(479)>"
467
- "[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c:c(:c(:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])](:c:1))-[#7])-[#1]","<regId=anil_di_alk_A(478)>"
468
- "[n;!H0,$(n-[#6;!H0;!H1])]:1(c(c(c:2:c:1:c:c:c:c:2-[#1])-[#6;X4]-[#1])-[$([#6](-[#1])-[#1]),$([#6]=,:[!#6&!#1]),$([#6](-[#1])-[#7]),$([#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](-[#1])-[#1])])","<regId=indol_3yl_alk(461)>"
469
- "[!#6&!#1]=[#6]-1-[#6]=,:[#6]-[#6](=[!#6&!#1])-[#6]=,:[#6]-1","<regId=quinone_A(370)>"
470
- "[#7;!R]=[#7]","<regId=azo_A(324)>"
471
- "[#6]-[#6](=[!#6&!#1;!R])-[#6](=[!#6&!#1;!R])-[$([#6]),$([#16](=[#8])=[#8])]","<regId=imine_one_A(321)>"
472
- "[#7]-[#6;X4]-c:1:c:c:c:c:c:1-[#8]-[#1]","<regId=mannich_A(296)>"
473
- "c:1:c:c(:c:c:c:1-[#7](-[#6;X4])-[#6;X4])-[#6]=[#6]","<regId=anil_di_alk_B(251)>"
474
- "c:1:c:c(:c:c:c:1-[#8]-[#6;X4])-[#7;$([#7!H0]-[#6;X4]),$([#7](-[#6;X4])-[#6;X4])]","<regId=anil_di_alk_C(246)>"
475
- "[#7]-1-[#6](=[#16])-[#16]-[#6](=[#6])-[#6]-1=[#8]","<regId=ene_rhod_A(235)>"
476
- "c:1(:c:c:c(:c:c:1)-[#6]=[#7]-[#7])-[#8]-[#1]","<regId=hzone_phenol_B(215)>"
477
- "[#6]-1(=[#6])-[#6]=[#7]-[!#6&!#1]-[#6]-1=[#8]","<regId=ene_five_het_A(201)>"
478
- "c:1:c:c(:c:c:c:1-[#7](-[#6;X4])-[#6;X4])-[#6;X4]-[$([#8]-[#1]),$([#6]=[#6]-[#1]),$([#7]-[#6;X4])]","<regId=anil_di_alk_D(198)>"
479
- "[#8]=[#6]-2-[#6](=!@[#7]-[#7])-c:1:c:c:c:c:c:1-[#7]-2","<regId=imine_one_isatin(189)>"
480
- "[#6](-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(:c(:c(:[c;!H0,$(c-[#6](-[#1])-[#1])](:c:1-[#1]))-[#6&!H0;!H1,$([#6]-[#6;!H0])])-[#1])-[#1]","<regId=anil_di_alk_E(186)>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/edm.py CHANGED
@@ -8,7 +8,6 @@ import torch.nn.functional as F
8
  from src import utils
9
  from src.egnn import Dynamics
10
  from src.noise import GammaNetwork, PredefinedNoiseSchedule
11
- from utils.logging_utils import get_logger
12
 
13
 
14
  class EDM(torch.nn.Module):
@@ -733,8 +732,6 @@ class InpaintingEDM(EDM):
733
  class RobustEDM(EDM):
734
  @torch.no_grad()
735
  def sample_chain(self, x, h, node_mask, fragment_mask, linker_mask, edge_mask, context, keep_frames=None):
736
- logger = get_logger()
737
-
738
  n_samples = x.size(0)
739
  n_nodes = x.size(1)
740
 
 
8
  from src import utils
9
  from src.egnn import Dynamics
10
  from src.noise import GammaNetwork, PredefinedNoiseSchedule
 
11
 
12
 
13
  class EDM(torch.nn.Module):
 
732
  class RobustEDM(EDM):
733
  @torch.no_grad()
734
  def sample_chain(self, x, h, node_mask, fragment_mask, linker_mask, edge_mask, context, keep_frames=None):
 
 
735
  n_samples = x.size(0)
736
  n_nodes = x.size(1)
737
 
src/lightning.py CHANGED
@@ -2,7 +2,6 @@ import numpy as np
2
  import os
3
  import pytorch_lightning as pl
4
  import torch
5
- import wandb
6
 
7
  from src import metrics, utils, delinker
8
  from src.const import LINKER_SIZE_DIST
@@ -13,7 +12,6 @@ from src.datasets import (
13
  )
14
  from src.linker_size import DistributionNodes
15
  from src.molecule_builder import build_molecules
16
- from src.visualizer import save_xyz_files, visualize_chain
17
  from typing import Dict, List, Optional
18
  from tqdm import tqdm
19
 
@@ -308,22 +306,6 @@ class DDPM(pl.LightningModule):
308
  self.log(f'{metric_name}/test', metric_value, prog_bar=True)
309
  self.metrics.setdefault(f'{metric_name}/test', []).append(metric_value)
310
 
311
- def generate_animation(self, chain_batch, node_mask, batch_i):
312
- batch_indices, mol_indices = utils.get_batch_idx_for_animation(self.batch_size, batch_i)
313
- for bi, mi in zip(batch_indices, mol_indices):
314
- chain = chain_batch[:, bi, :, :]
315
- name = f'mol_{mi}'
316
- chain_output = os.path.join(self.samples_dir, f'epoch_{self.current_epoch}', name)
317
- os.makedirs(chain_output, exist_ok=True)
318
-
319
- one_hot = chain[:, :, 3:-1] if self.include_charges else chain[:, :, 3:]
320
- positions = chain[:, :, :3]
321
- chain_node_mask = torch.cat([node_mask[bi].unsqueeze(0) for _ in range(self.FRAMES)], dim=0)
322
- names = [f'{name}_{j}' for j in range(self.FRAMES)]
323
-
324
- save_xyz_files(chain_output, one_hot, positions, chain_node_mask, names=names, is_geom=self.is_geom)
325
- visualize_chain(chain_output, wandb=wandb, mode=name, is_geom=self.is_geom)
326
-
327
  def sample_and_analyze(self, dataloader):
328
  pred_molecules = []
329
  true_molecules = []
 
2
  import os
3
  import pytorch_lightning as pl
4
  import torch
 
5
 
6
  from src import metrics, utils, delinker
7
  from src.const import LINKER_SIZE_DIST
 
12
  )
13
  from src.linker_size import DistributionNodes
14
  from src.molecule_builder import build_molecules
 
15
  from typing import Dict, List, Optional
16
  from tqdm import tqdm
17
 
 
306
  self.log(f'{metric_name}/test', metric_value, prog_bar=True)
307
  self.metrics.setdefault(f'{metric_name}/test', []).append(metric_value)
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  def sample_and_analyze(self, dataloader):
310
  pred_molecules = []
311
  true_molecules = []