Spaces:

libokj
/

GenFBDD

Sleeping

App Files Files Community

libokj committed 30 days ago

Commit

c17cba8

1 Parent(s): 32e688d

Minify

Browse files

Files changed (30) hide show

app/db.py +1 -1
app/{utils.py → fn.py} +0 -2
app/main.py +12 -17
app/static.py +1 -0
compute_metrics.py +451 -0
confidence/confidence_train.py +0 -320
confidence/dataset.py +0 -276
{confidence → datasets}/__init__.py +0 -0
datasets/conformer_matching.py +85 -0
datasets/constants.py +179 -0
datasets/dataloader.py +101 -0
datasets/esm_embedding_preparation.py +89 -0
datasets/esm_embeddings_to_pt.py +16 -0
datasets/loader.py +123 -0
datasets/moad.py +547 -0
datasets/parse_chi.py +146 -0
datasets/pdb.py +536 -0
datasets/pdbbind.py +472 -0
datasets/process_mols.py +499 -0
datasets/sidechain_esm_embeddings_to_pt.py +39 -0
inference.py +1 -2
requirements.txt +30 -0
resources/animations/example_3.gif +0 -0
resources/animations/example_6.gif +0 -0
resources/linker_size_distributions.png +0 -0
resources/moad_test_pdbs.txt +0 -90
resources/moad_val_pdbs.txt +0 -153
resources/wehi_pains.csv +0 -480
src/edm.py +0 -3
src/lightning.py +0 -18

app/db.py CHANGED Viewed

@@ -9,7 +9,7 @@ import pytz
 import requests
 from tinydb import TinyDB, where
-from app.utils import send_email
 SERVER_DATA_DIR = os.getenv('DATA', 'results')
 DB_EXPIRY = timedelta(hours=48).total_seconds()

 import requests
 from tinydb import TinyDB, where
+from app.fn import send_email
 SERVER_DATA_DIR = os.getenv('DATA', 'results')
 DB_EXPIRY = timedelta(hours=48).total_seconds()

app/{utils.py → fn.py} RENAMED Viewed

@@ -644,8 +644,6 @@ def download_file(url):
         return None
 def uniprot_to_pdb(uniprot_id):
     """Queries the RCSB PDB API to find PDB entities associated with a UniProt ID."""
     base_url = "https://search.rcsb.org/rcsbsearch/v2/query"

         return None
 def uniprot_to_pdb(uniprot_id):
     """Queries the RCSB PDB API to find PDB entities associated with a UniProt ID."""
     base_url = "https://search.rcsb.org/rcsbsearch/v2/query"

app/main.py CHANGED Viewed

@@ -1,15 +1,10 @@
 import uuid
 import zipfile
 from datetime import datetime
-from email.mime.multipart import MIMEMultipart
-from email.mime.text import MIMEText
-from email.utils import formatdate, make_msgid
 from pathlib import Path
 from time import sleep, time
-import requests
 import torch
-import yaml
 from email_validator import validate_email, EmailNotValidError
 from Bio import SeqIO
 import gradio as gr
@@ -18,11 +13,10 @@ from omegaconf import OmegaConf
 import pandas as pd
 from rdkit import Chem
 from rdkit.Chem import PandasTools
-from tinydb import where
 from inference import (read_fragment_library, process_fragment_library, extract_pockets,
                        dock_fragments, generate_linkers, select_fragment_pairs)
-from app import static, utils, db
 gr.set_static_paths(paths=["data/", "results/"])
@@ -65,7 +59,7 @@ def process_drug_library_upload(library_upload):
         )
     else:
         raise gr.Error('Current supported fragment library formats only include CSV and SDF files.')
-    utils.validate_columns(df, ['X1'])
     return df
@@ -511,8 +505,8 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
             result_protein_file = gr.File(visible=False, interactive=False)
             with gr.Column(variant='panel'):
                 with gr.Row():
-                    scores = gr.CheckboxGroup(list(utils.SCORE_MAP.keys()), label='Compound Scores')
-                    filters = gr.CheckboxGroup(list(utils.FILTER_MAP.keys()), label='Compound Filters')
                 with gr.Row():
                     prop_clr_btn = gr.ClearButton(value='Clear Properties', interactive=False)
                     prop_calc_btn = gr.Button(value='Calculate Properties', interactive=False)
@@ -592,7 +586,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
             }
         elif filepath.suffix == '.fasta':
             seq = next(SeqIO.parse(file, 'fasta')).seq
-            filepath = utils.pdb_query(seq, method='FASTA Sequence')
             return {
                 input_prot_file: gr.File(str(filepath), visible=True),
                 prot_query_input: seq,
@@ -606,7 +600,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
     #     outputs=[prot_file],
     # )
     # prot_file.change(
-    #     fn=lambda file: gr.HTML(utils.create_complex_view_html(file), visible=True),
     #     inputs=[prot_file],
     #     outputs=[input_prot_view],
     # )
@@ -618,7 +612,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
     )
     prot_query_btn.click(
-        fn=utils.pdb_query,
         inputs=[prot_query_input, prot_query_dropdown],
         outputs=[input_prot_file],
     )
@@ -640,7 +634,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
     # pocket_extract_btn.click(
     #     fn=lambda: gr.Info('Extracting pocket...'),
     # ).then(
-    #     fn=utils.extract_pockets_and_update_view,
     #     js=static.RETURN_LIGAND_SELECTION_JS,
     #     inputs=[prot_file, selected_ligand],
     #     outputs=[input_prot_view, pocket_path_dict, selected_ligand, selected_pocket],
@@ -759,11 +753,11 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
         try:
             for filter_name in filter_list:
                 mod_df[filter_name] = mod_df['Compound'].parallel_apply(
-                    lambda x: utils.FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
             for score_name in score_list:
                 mod_df[score_name] = mod_df['Compound'].parallel_apply(
-                    lambda x: utils.SCORE_MAP[score_name](x) if not pd.isna(x) else x)
             return {result_table_mod_df: mod_df}
@@ -784,7 +778,7 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
         outputs=[result_protein_file],
     )
     result_table_mod_df.change(
-        fn=utils.create_result_table_html,
         inputs=[result_table_mod_df],
         outputs=[result_table_view]
     ).success(
@@ -828,4 +822,5 @@ with gr.Blocks(theme=THEME, title='GenFBDD', css=static.CSS, delete_cache=(3600,
 demo.launch(
     server_name='0.0.0.0',
     max_file_size="5mb",
 )

 import uuid
 import zipfile
 from datetime import datetime
 from pathlib import Path
 from time import sleep, time
 import torch
 from email_validator import validate_email, EmailNotValidError
 from Bio import SeqIO
 import gradio as gr
 import pandas as pd
 from rdkit import Chem
 from rdkit.Chem import PandasTools
 from inference import (read_fragment_library, process_fragment_library, extract_pockets,
                        dock_fragments, generate_linkers, select_fragment_pairs)
+from app import static, fn, db
 gr.set_static_paths(paths=["data/", "results/"])
         )
     else:
         raise gr.Error('Current supported fragment library formats only include CSV and SDF files.')
+    fn.validate_columns(df, ['X1'])
     return df
             result_protein_file = gr.File(visible=False, interactive=False)
             with gr.Column(variant='panel'):
                 with gr.Row():
+                    scores = gr.CheckboxGroup(list(fn.SCORE_MAP.keys()), label='Compound Scores')
+                    filters = gr.CheckboxGroup(list(fn.FILTER_MAP.keys()), label='Compound Filters')
                 with gr.Row():
                     prop_clr_btn = gr.ClearButton(value='Clear Properties', interactive=False)
                     prop_calc_btn = gr.Button(value='Calculate Properties', interactive=False)
             }
         elif filepath.suffix == '.fasta':
             seq = next(SeqIO.parse(file, 'fasta')).seq
+            filepath = fn.pdb_query(seq, method='FASTA Sequence')
             return {
                 input_prot_file: gr.File(str(filepath), visible=True),
                 prot_query_input: seq,
     #     outputs=[prot_file],
     # )
     # prot_file.change(
+    #     fn=lambda file: gr.HTML(fn.create_complex_view_html(file), visible=True),
     #     inputs=[prot_file],
     #     outputs=[input_prot_view],
     # )
     )
     prot_query_btn.click(
+        fn=fn.pdb_query,
         inputs=[prot_query_input, prot_query_dropdown],
         outputs=[input_prot_file],
     )
     # pocket_extract_btn.click(
     #     fn=lambda: gr.Info('Extracting pocket...'),
     # ).then(
+    #     fn=fn.extract_pockets_and_update_view,
     #     js=static.RETURN_LIGAND_SELECTION_JS,
     #     inputs=[prot_file, selected_ligand],
     #     outputs=[input_prot_view, pocket_path_dict, selected_ligand, selected_pocket],
         try:
             for filter_name in filter_list:
                 mod_df[filter_name] = mod_df['Compound'].parallel_apply(
+                    lambda x: fn.FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
             for score_name in score_list:
                 mod_df[score_name] = mod_df['Compound'].parallel_apply(
+                    lambda x: fn.SCORE_MAP[score_name](x) if not pd.isna(x) else x)
             return {result_table_mod_df: mod_df}
         outputs=[result_protein_file],
     )
     result_table_mod_df.change(
+        fn=fn.create_result_table_html,
         inputs=[result_table_mod_df],
         outputs=[result_table_view]
     ).success(
 demo.launch(
     server_name='0.0.0.0',
     max_file_size="5mb",
+    ssr_mode=False
 )

app/static.py CHANGED Viewed

@@ -788,6 +788,7 @@ SETUP_JS = """
     scripts.forEach((script) => {
         const scriptElement = document.createElement("script");
         scriptElement.src = script;
         document.head.appendChild(scriptElement);
     });

     scripts.forEach((script) => {
         const scriptElement = document.createElement("script");
         scriptElement.src = script;
+        scriptElement.async = true;
         document.head.appendChild(scriptElement);
     });

compute_metrics.py ADDED Viewed

	@@ -0,0 +1,451 @@

+#!/usr/bin/env python
+import csv
+import numpy as np
+import pandas as pd
+import sys
+from networkx.algorithms import isomorphism
+from rdkit import Chem
+from rdkit.Chem import MolStandardize, QED, rdMolAlign, rdMolDescriptors
+from src.delinker_utils import calc_SC_RDKit, frag_utils, sascorer
+from src.utils import disable_rdkit_logging
+from tqdm import tqdm
+from pdb import set_trace
+disable_rdkit_logging()
+if len(sys.argv) != 9:
+    print("Not provided all arguments")
+    quit()
+data_set = sys.argv[1]  # Options: ZINC, CASF
+gen_smi_file = sys.argv[2]  # Path to generated molecules
+train_set_path = sys.argv[3]  # Path to training set
+n_cores = int(sys.argv[4])  # Number of cores to use
+verbose = bool(sys.argv[5])  # Output results
+if sys.argv[6] == "None":
+    restrict = None
+else:
+    restrict = int(sys.argv[6])  # Set to None if don't want to restrict
+pains_smarts_loc = sys.argv[7]  # Path to PAINS SMARTS
+method = sys.argv[8]
+assert method in ['diffusion', '3dlinker', 'delinker']
+if verbose:
+    print("##### Start Settings #####")
+    print("Data set:", data_set)
+    print("Generated smiles file:", gen_smi_file)
+    print("Training set:", train_set_path)
+    print("Number of cores:", n_cores)
+    print("Verbose:", verbose)
+    print("Restrict data:", restrict)
+    print("PAINS SMARTS location:", pains_smarts_loc)
+    print("#####  End Settings  #####")
+# Load molecules
+# FORMAT: (Starting fragments (SMILES), Original molecule (SMILES), Generated molecule (SMILES), Generated linker)
+data = []
+with open(gen_smi_file, 'r') as f:
+    for line in tqdm(f.readlines()):
+        parts = line.strip().split(' ')
+        data.append({
+            'fragments': parts[0],
+            'true_molecule': parts[1],
+            'pred_molecule': parts[2],
+            'pred_linker': parts[3] if len(parts) > 3 else '',
+        })
+if restrict is not None:
+    data = data[:restrict]
+summary = {}
+# -------------- Validity -------------- #
+def is_valid(pred_mol_smiles, frag_smiles):
+    pred_mol = Chem.MolFromSmiles(pred_mol_smiles)
+    frag = Chem.MolFromSmiles(frag_smiles)
+    if frag is None:
+        return False
+    if pred_mol is None:
+        return False
+    try:
+        Chem.SanitizeMol(pred_mol, sanitizeOps=Chem.SanitizeFlags.SANITIZE_PROPERTIES)
+    except Exception:
+        return False
+    if len(pred_mol.GetSubstructMatch(frag)) != frag.GetNumAtoms():
+        return False
+    return True
+valid_cnt = 0
+total_cnt = 0
+for obj in tqdm(data):
+    valid = is_valid(obj['pred_molecule'], obj['fragments'])
+    obj['valid'] = valid
+    valid_cnt += valid
+    total_cnt += 1
+validity = valid_cnt / total_cnt * 100
+print(f'Validity: {validity:.3f}%')
+summary['validity'] = validity
+# ----------------- QED ------------------ #
+qed_values = []
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['qed'] = None
+        continue
+    qed = QED.qed(Chem.MolFromSmiles(obj['pred_molecule']))
+    obj['qed'] = qed
+    qed_values.append(qed)
+print(f'Mean QED: {np.mean(qed_values):.3f}')
+summary['qed'] = np.mean(qed_values)
+# ----------------- SA ------------------ #
+sa_values = []
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['sa'] = None
+        continue
+    sa = sascorer.calculateScore(Chem.MolFromSmiles(obj['pred_molecule']))
+    obj['sa'] = sa
+    sa_values.append(sa)
+print(f'Mean SA: {np.mean(sa_values):.3f}')
+summary['sa'] = np.mean(sa_values)
+# ----------------- Number of Rings ------------------ #
+rings_n_values = []
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['rings_n'] = None
+        continue
+    try:
+        rings_n = rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(obj['pred_linker']))
+    except:
+        continue
+    obj['rings_n'] = rings_n
+    rings_n_values.append(rings_n)
+print(f'Mean Number of Rings: {np.mean(rings_n_values):.3f}')
+summary['rings_n'] = np.mean(rings_n_values)
+# -------------- Uniqueness -------------- #
+true2samples = dict()
+for obj in tqdm(data):
+    if not obj['valid']:
+        continue
+    true_mol = obj['true_molecule']
+    true_frags = obj['fragments']
+    key = f'{true_mol}_{true_frags}'
+    true2samples.setdefault(key, []).append(obj['pred_molecule'])
+unique_cnt = 0
+total_cnt = 0
+for samples in tqdm(true2samples.values()):
+    unique_cnt += len(set(samples))
+    total_cnt += len(samples)
+uniqueness = unique_cnt / total_cnt * 100
+print(f'Uniqueness: {uniqueness:.3f}%')
+summary['uniqueness'] = uniqueness
+# ----------------- Novelty ---------------- #
+linkers_train = set()
+with open(train_set_path, 'r') as f:
+    for line in f:
+        linkers_train.add(line.strip())
+novel_cnt = 0
+total_cnt = 0
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['pred_linker_clean'] = None
+        obj['novel'] = False
+        continue
+    try:
+        linker = Chem.RemoveStereochemistry(obj['pred_linker'])
+        linker = MolStandardize.canonicalize_tautomer_smiles(Chem.MolToSmiles(linker))
+    except Exception:
+        linker = obj['pred_linker']
+    novel = linker not in linkers_train
+    obj['pred_linker_clean'] = linker
+    obj['novel'] = novel
+    novel_cnt += novel
+    total_cnt += 1
+novelty = novel_cnt / total_cnt * 100
+print(f'Novelty: {novelty:.3f}%')
+summary['novelty'] = novelty
+# ----------------- Recovery ---------------- #
+recovered_inputs = set()
+all_inputs = set()
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['recovered'] = False
+        continue
+    key = obj['true_molecule'] + '_' + obj['fragments']
+    try:
+        true_mol = Chem.MolFromSmiles(obj['true_molecule'])
+        Chem.RemoveStereochemistry(true_mol)
+        true_mol_smi = Chem.MolToSmiles(Chem.RemoveHs(true_mol))
+    except:
+        true_mol = Chem.MolFromSmiles(obj['true_molecule'], sanitize=False)
+        Chem.RemoveStereochemistry(true_mol)
+        true_mol_smi = Chem.MolToSmiles(Chem.RemoveHs(true_mol, sanitize=False))
+    pred_mol = Chem.MolFromSmiles(obj['pred_molecule'])
+    Chem.RemoveStereochemistry(pred_mol)
+    pred_mol_smi = Chem.MolToSmiles(Chem.RemoveHs(pred_mol))
+    recovered = true_mol_smi == pred_mol_smi
+    obj['recovered'] = recovered
+    if recovered:
+        recovered_inputs.add(key)
+    all_inputs.add(key)
+recovery = len(recovered_inputs) / len(all_inputs) * 100
+print(f'Recovery: {recovery:.3f}%')
+summary['recovery'] = recovery
+# ----------------- PAINS Filter ---------------- #
+def check_pains(mol, pains):
+    for pain in pains:
+        if mol.HasSubstructMatch(pain):
+            return False
+    return True
+with open(pains_smarts_loc, 'r') as f:
+    pains_smarts = [Chem.MolFromSmarts(line[0], mergeHs=True) for line in csv.reader(f)]
+    pains_smarts = set(pains_smarts)
+passed_pains_cnt = 0
+total_cnt = 0
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['passed_pains'] = False
+        continue
+    pred_mol = Chem.MolFromSmiles(obj['pred_molecule'])
+    passed_pains = check_pains(pred_mol, pains_smarts)
+    obj['passed_pains'] = passed_pains
+    passed_pains_cnt += passed_pains
+    total_cnt += 1
+pains_score = passed_pains_cnt / total_cnt * 100
+print(f'Passed PAINS: {pains_score:.3f}%')
+summary['pains'] = pains_score
+# ----------------- RA Filter ---------------- #
+def check_ring_filter(linker):
+    check = True
+    ssr = Chem.GetSymmSSSR(linker)
+    for ring in ssr:
+        for atom_idx in ring:
+            for bond in linker.GetAtomWithIdx(atom_idx).GetBonds():
+                if bond.GetBondType() == 2 and bond.GetBeginAtomIdx() in ring and bond.GetEndAtomIdx() in ring:
+                    check = False
+    return check
+passed_ring_filter_cnt = 0
+total_cnt = 0
+for obj in tqdm(data):
+    if not obj['valid']:
+        obj['passed_ring_filter'] = False
+        continue
+    pred_linker = Chem.MolFromSmiles(obj['pred_linker'], sanitize=False)
+    try:
+        passed_ring_filter = check_ring_filter(pred_linker)
+    except:
+        obj['passed_ring_filter'] = False
+        continue
+    obj['passed_ring_filter'] = passed_ring_filter
+    passed_ring_filter_cnt += passed_ring_filter
+    total_cnt += 1
+ra_score = passed_ring_filter_cnt / total_cnt * 100
+print(f'Passed Ring Filter: {ra_score:.3f}%')
+summary['ra'] = ra_score
+# ---------------------------- Saving -------------------------------- #
+out_path = gen_smi_file[:-3] + 'csv'
+table = pd.DataFrame(data)
+table.to_csv(out_path, index=False)
+summary_path = gen_smi_file[:-4] + '_summary.csv'
+summary_table = pd.DataFrame([summary])
+summary_table.to_csv(summary_path, index=False)
+# ----------------------- RMSD --------------------- #
+sdf_path = gen_smi_file[:-3] + 'sdf'
+pred_mol_3d = Chem.SDMolSupplier(sdf_path)
+if method == 'diffusion' and data_set == 'ZINC':
+    # Use SMILES of test set generated for molecules processed by OpenBabel
+    # (for consistency with other evaluation metrics)
+    # Because SMILES produced by our model are also based on OpenBabel
+    true_smi_path = 'datasets/zinc_final_test_smiles.smi'
+    true_mol_path = 'datasets/zinc_final_test_molecules.sdf'
+    true_smi = pd.read_csv(true_smi_path, sep=' ', names=['mol', 'frag']).mol.values
+    true_mol_3d = Chem.SDMolSupplier(true_mol_path)
+    true_smi2mol3d = dict(zip(true_smi, true_mol_3d))
+elif method == 'diffusion' and data_set == 'CASF':
+    # Use SMILES of test set generated for molecules processed by OpenBabel
+    # (for consistency with other evaluation metrics)
+    # Because SMILES produced by our model are also based on OpenBabel
+    true_smi_path = 'datasets/casf_final_test_smiles.smi'
+    true_mol_path = 'datasets/casf_final_test_molecules.sdf'
+    true_smi = pd.read_csv(true_smi_path, sep=' ', names=['mol', 'frag']).mol.values
+    true_mol_3d = Chem.SDMolSupplier(true_mol_path)
+    true_smi2mol3d = dict(zip(true_smi, true_mol_3d))
+elif method == 'diffusion' and data_set == 'GEOM':
+    # Use SMILES of test set generated for molecules processed by OpenBabel
+    # (for consistency with other evaluation metrics)
+    # Because SMILES produced by our model are also based on OpenBabel
+    true_smi_path = 'datasets/geom_multifrag_test_smiles.smi'
+    true_mol_path = 'datasets/geom_multifrag_test_molecules.sdf'
+    true_smi = pd.read_csv(true_smi_path, sep=' ', names=['mol', 'frag']).mol.values
+    true_mol_3d = Chem.SDMolSupplier(true_mol_path)
+    true_smi2mol3d = dict(zip(true_smi, true_mol_3d))
+elif method == 'diffusion' and data_set == 'MOAD':
+    # Use SMILES of test set generated for molecules processed by OpenBabel
+    # (for consistency with other evaluation metrics)
+    # Because SMILES produced by our model are also based on OpenBabel
+    true_smi_path = 'datasets/MOAD_test_smiles.smi'
+    true_mol_path = 'datasets/MOAD_test_molecules.sdf'
+    true_smi = pd.read_csv(true_smi_path, sep=' ', names=['mol', 'frag']).mol.values
+    true_mol_3d = Chem.SDMolSupplier(true_mol_path)
+    true_smi2mol3d = dict(zip(true_smi, true_mol_3d))
+else:
+    raise NotImplementedError
+def find_exit(mol, num_frag):
+    neighbors = []
+    for atom_idx in range(num_frag, mol.GetNumAtoms()):
+        N = mol.GetAtoms()[atom_idx].GetNeighbors()
+        for n in N:
+            if n.GetIdx() < num_frag:
+                neighbors.append(n.GetIdx())
+    return neighbors
+rmsd_list = []
+for i, (obj, pred) in tqdm(enumerate(zip(data, pred_mol_3d)), total=len(data)):
+    obj['rmsd'] = None
+    if not obj['recovered']:
+        continue
+    true = true_smi2mol3d[obj['true_molecule']]
+    Chem.RemoveStereochemistry(true)
+    true = Chem.RemoveHs(true)
+    Chem.RemoveStereochemistry(pred)
+    pred = Chem.RemoveHs(pred)
+    G1 = frag_utils.topology_from_rdkit(pred)
+    G2 = frag_utils.topology_from_rdkit(true)
+    GM = isomorphism.GraphMatcher(G1, G2)
+    flag = GM.is_isomorphic()
+    frag_size = Chem.MolFromSmiles(obj['fragments']).GetNumAtoms()
+    # exits = find_exit(pred, frag_size)
+    # if flag and len(exits) == 2:
+    if flag:
+        error = Chem.rdMolAlign.GetBestRMS(pred, true)
+        # try:
+        #     error = Chem.rdMolAlign.GetBestRMS(pred, true)
+        # except:
+        #     set_trace()
+        num_linker = pred.GetNumAtoms() - frag_size
+        num_atoms = pred.GetNumAtoms()
+        error *= np.sqrt(num_atoms / num_linker)  # only count rmsd on linker
+        rmsd_list.append(error)
+        obj['rmsd'] = error
+rmsd_score = np.mean(rmsd_list)
+print(f'Mean RMSD: {rmsd_score:.3f}')
+summary['rmsd'] = rmsd_score
+# ----------------------------- SC-RDKit -------------------------- #
+def calc_sc_rdkit_full_mol(gen_mol, ref_mol):
+    try:
+        _ = rdMolAlign.GetO3A(gen_mol, ref_mol).Align()
+        sc_score = calc_SC_RDKit.calc_SC_RDKit_score(gen_mol, ref_mol)
+        return sc_score
+    except:
+        return -0.5
+sc_rdkit_list = []
+for i, (obj, pred) in tqdm(enumerate(zip(data, pred_mol_3d)), total=len(data)):
+    obj['sc_rdkit'] = None
+    if not obj['valid']:
+        continue
+    true = true_smi2mol3d[obj['true_molecule']]
+    score = calc_sc_rdkit_full_mol(pred, true)
+    sc_rdkit_list.append(score)
+    obj['sc_rdkit'] = score
+sc_rdkit_list = np.array(sc_rdkit_list)
+sc_rdkit_7 = (sc_rdkit_list > 0.7).sum() / len(sc_rdkit_list) * 100
+sc_rdkit_8 = (sc_rdkit_list > 0.8).sum() / len(sc_rdkit_list) * 100
+sc_rdkit_9 = (sc_rdkit_list > 0.9).sum() / len(sc_rdkit_list) * 100
+sc_rdkit_mean = np.mean(sc_rdkit_list)
+print(f'SC_RDKit > 0.7: {sc_rdkit_7:3f}%')
+print(f'SC_RDKit > 0.8: {sc_rdkit_8:3f}%')
+print(f'SC_RDKit > 0.9: {sc_rdkit_9:3f}%')
+print(f'Mean SC_RDKit: {sc_rdkit_mean}')
+summary['sc_rdkit_7'] = sc_rdkit_7
+summary['sc_rdkit_8'] = sc_rdkit_8
+summary['sc_rdkit_9'] = sc_rdkit_9
+summary['sc_rdkit_mean'] = sc_rdkit_mean
+# ---------------------------- Saving -------------------------------- #
+out_path = gen_smi_file[:-3] + 'csv'
+table = pd.DataFrame(data)
+table.to_csv(out_path, index=False)
+summary_path = gen_smi_file[:-4] + '_summary.csv'
+summary_table = pd.DataFrame([summary])
+summary_table.to_csv(summary_path, index=False)

confidence/confidence_train.py DELETED Viewed

@@ -1,320 +0,0 @@
-import gc
-import math
-import os
-import shutil
-from argparse import Namespace, ArgumentParser, FileType
-import torch.nn.functional as F
-import wandb
-import torch
-from sklearn.metrics import roc_auc_score
-from torch_geometric.loader import DataListLoader, DataLoader
-from tqdm import tqdm
-from confidence.dataset import ConfidenceDataset
-from utils.training import AverageMeter
-torch.multiprocessing.set_sharing_strategy('file_system')
-import yaml
-from utils.utils import save_yaml_file, get_optimizer_and_scheduler, get_model
-# Command-line interface of the confidence-model training script.
-parser = ArgumentParser()
-parser.add_argument('--config', type=FileType(mode='r'), default=None)
-parser.add_argument('--original_model_dir', type=str, default='workdir', help='Path to folder with trained model and hyperparameters')
-parser.add_argument('--restart_dir', type=str, default=None, help='')
-parser.add_argument('--use_original_model_cache', action='store_true', default=False, help='If this is true, the same dataset as in the original model will be used. Otherwise, the dataset parameters are used.')
-parser.add_argument('--data_dir', type=str, default='data/PDBBind_processed/', help='Folder containing original structures')
-parser.add_argument('--ckpt', type=str, default='best_model.pt', help='Checkpoint to use inside the folder')
-parser.add_argument('--model_save_frequency', type=int, default=0, help='Frequency with which to save the last model. If 0, then only the early stopping criterion best model is saved and overwritten.')
-parser.add_argument('--best_model_save_frequency', type=int, default=0, help='Frequency with which to save the best model. If 0, then only the early stopping criterion best model is saved and overwritten.')
-parser.add_argument('--run_name', type=str, default='test_confidence', help='')
-parser.add_argument('--project', type=str, default='diffdock_confidence', help='')
-parser.add_argument('--split_train', type=str, default='data/splits/timesplit_no_lig_overlap_train', help='Path of file defining the split')
-parser.add_argument('--split_val', type=str, default='data/splits/timesplit_no_lig_overlap_val', help='Path of file defining the split')
-parser.add_argument('--split_test', type=str, default='data/splits/timesplit_test', help='Path of file defining the split')
-# Inference parameters for creating the positions and rmsds that the confidence predictor will be trained on.
-parser.add_argument('--cache_path', type=str, default='data/cacheNew', help='Folder from where to load/restore cached dataset')
-parser.add_argument('--cache_ids_to_combine', nargs='+', type=str, default=None, help='Ids of cached inference runs (files ligand_positions_id<ID>.pkl) whose positions and RMSDs are concatenated into one confidence dataset.')
-parser.add_argument('--cache_creation_id', type=int, default=None, help='If set, the dataset looks for ligand_positions_id<ID>.pkl instead of ligand_positions.pkl when deciding whether inference has to be run.')
-parser.add_argument('--wandb', action='store_true', default=False, help='')
-parser.add_argument('--inference_steps', type=int, default=2, help='Number of denoising steps')
-parser.add_argument('--samples_per_complex', type=int, default=3, help='')
-parser.add_argument('--balance', action='store_true', default=False, help='If set, the label of each training sample is drawn as positive or negative with equal probability and a matching pose is sampled. Cannot be combined with multiple RMSD cutoffs.')
-parser.add_argument('--rmsd_prediction', action='store_true', default=False, help='')
-# A scalar default (no list) selects binary classification; passing the flag always yields a list.
-parser.add_argument('--rmsd_classification_cutoff', nargs='+', type=float, default=2, help='RMSD value below which a prediction is considered a positive. This can also be multiple cutoffs.')
-parser.add_argument('--log_dir', type=str, default='workdir', help='')
-parser.add_argument('--main_metric', type=str, default='accuracy', help='Metric to track for early stopping. Mostly [loss, accuracy, ROC AUC]')
-parser.add_argument('--main_metric_goal', type=str, default='max', help='Can be [min, max]')
-parser.add_argument('--transfer_weights', action='store_true', default=False, help='')
-parser.add_argument('--batch_size', type=int, default=5, help='')
-parser.add_argument('--lr', type=float, default=1e-3, help='')
-parser.add_argument('--w_decay', type=float, default=0.0, help='')
-parser.add_argument('--scheduler', type=str, default='plateau', help='')
-parser.add_argument('--scheduler_patience', type=int, default=20, help='')
-parser.add_argument('--n_epochs', type=int, default=5, help='')
-# Dataset
-parser.add_argument('--limit_complexes', type=int, default=0, help='')
-# NOTE(review): store_true with default=True means this flag cannot be switched off from the command line.
-parser.add_argument('--all_atoms', action='store_true', default=True, help='')
-parser.add_argument('--multiplicity', type=int, default=1, help='')
-parser.add_argument('--chain_cutoff', type=float, default=10, help='')
-parser.add_argument('--receptor_radius', type=float, default=30, help='')
-parser.add_argument('--c_alpha_max_neighbors', type=int, default=10, help='')
-parser.add_argument('--atom_radius', type=float, default=5, help='')
-parser.add_argument('--atom_max_neighbors', type=int, default=8, help='')
-parser.add_argument('--matching_popsize', type=int, default=20, help='')
-parser.add_argument('--matching_maxiter', type=int, default=20, help='')
-parser.add_argument('--max_lig_size', type=int, default=None, help='Maximum number of heavy atoms')
-parser.add_argument('--remove_hs', action='store_true', default=False, help='remove Hs')
-parser.add_argument('--num_conformers', type=int, default=1, help='')
-parser.add_argument('--esm_embeddings_path', type=str, default=None,help='If this is set then the LM embeddings at that path will be used for the receptor features')
-parser.add_argument('--no_torsion', action='store_true', default=False, help='')
-# Model
-parser.add_argument('--num_conv_layers', type=int, default=2, help='Number of interaction layers')
-parser.add_argument('--max_radius', type=float, default=5.0, help='Radius cutoff for geometric graph')
-# NOTE(review): store_true with default=True means this flag cannot be switched off from the command line.
-parser.add_argument('--scale_by_sigma', action='store_true', default=True, help='Whether to normalise the score')
-parser.add_argument('--ns', type=int, default=16, help='Number of hidden features per node of order 0')
-parser.add_argument('--nv', type=int, default=4, help='Number of hidden features per node of order >0')
-parser.add_argument('--distance_embed_dim', type=int, default=32, help='')
-parser.add_argument('--cross_distance_embed_dim', type=int, default=32, help='')
-parser.add_argument('--no_batch_norm', action='store_true', default=False, help='If set, it removes the batch norm')
-parser.add_argument('--use_second_order_repr', action='store_true', default=False, help='Whether to use only up to first order representations or also second')
-parser.add_argument('--cross_max_distance', type=float, default=80, help='')
-parser.add_argument('--dynamic_max_cross', action='store_true', default=False, help='')
-parser.add_argument('--dropout', type=float, default=0.0, help='MLP dropout')
-parser.add_argument('--embedding_type', type=str, default="sinusoidal", help='')
-parser.add_argument('--sigma_embed_dim', type=int, default=32, help='')
-parser.add_argument('--embedding_scale', type=int, default=10000, help='')
-parser.add_argument('--confidence_no_batchnorm', action='store_true', default=False, help='')
-parser.add_argument('--confidence_dropout', type=float, default=0.0, help='MLP dropout in confidence readout')
-args = parser.parse_args()
-if args.config:
-    # Merge the YAML config into the parsed arguments: list values extend an existing
-    # list argument, every other value replaces the argument.
-    config_dict = yaml.load(args.config, Loader=yaml.FullLoader) or {}  # an empty file loads as None
-    arg_dict = args.__dict__
-    for key, value in config_dict.items():
-        if isinstance(value, list) and isinstance(arg_dict.get(key), list):
-            arg_dict[key].extend(value)
-        elif isinstance(value, list):
-            # The current value is None or a scalar default (e.g. --rmsd_classification_cutoff),
-            # which has no ``append``; the configured list replaces it.
-            arg_dict[key] = list(value)
-        else:
-            arg_dict[key] = value
-    # Keep only the file name so that the namespace stays serialisable.
-    args.config = args.config.name
-# Explicit check instead of ``assert`` so that it is not stripped under ``python -O``.
-if args.main_metric_goal not in ('max', 'min'):
-    raise ValueError(f"--main_metric_goal must be 'min' or 'max', got {args.main_metric_goal!r}")
-def train_epoch(model, loader, optimizer, rmsd_prediction):
-    """Run one training pass over ``loader`` and return the meter summary.
-
-    The loss is MSE on ``rmsd`` when ``rmsd_prediction`` is set, cross-entropy on
-    ``y_binned`` when ``args.rmsd_classification_cutoff`` is a list, and binary
-    cross-entropy with logits on ``y`` otherwise. Relies on the module-level
-    ``device`` and ``args``. Batches that run out of memory are skipped.
-    """
-    model.train()
-    meter = AverageMeter(['confidence_loss'])
-    for data in tqdm(loader, total=len(loader)):
-        if device.type == 'cuda' and len(data) % torch.cuda.device_count() == 1 or device.type == 'cpu' and data.num_graphs == 1:
-            print("Skipping batch of size 1 since otherwise batchnorm would not work.")
-            # The original only printed the message and then processed the batch anyway.
-            continue
-        optimizer.zero_grad()
-        try:
-            pred = model(data)
-            # ``data`` is a list of graphs or a single batched object, depending on the loader.
-            if rmsd_prediction:
-                labels = torch.cat([graph.rmsd for graph in data]).to(device) if isinstance(data, list) else data.rmsd
-                confidence_loss = F.mse_loss(pred, labels)
-            else:
-                if isinstance(args.rmsd_classification_cutoff, list):
-                    labels = torch.cat([graph.y_binned for graph in data]).to(device) if isinstance(data, list) else data.y_binned
-                    confidence_loss = F.cross_entropy(pred, labels)
-                else:
-                    labels = torch.cat([graph.y for graph in data]).to(device) if isinstance(data, list) else data.y
-                    confidence_loss = F.binary_cross_entropy_with_logits(pred, labels)
-            confidence_loss.backward()
-            optimizer.step()
-            meter.add([confidence_loss.cpu().detach()])
-        except RuntimeError as e:
-            if 'out of memory' in str(e):
-                print('| WARNING: ran out of memory, skipping batch')
-                for p in model.parameters():
-                    if p.grad is not None:
-                        del p.grad  # free some memory
-                torch.cuda.empty_cache()
-                gc.collect()
-                continue
-            else:
-                raise
-    return meter.summary()
-def test_epoch(model, loader, rmsd_prediction):
-    """Evaluate ``model`` on ``loader``.
-
-    Returns ``(meter.summary(), baseline_metric)``. The baseline is the mean absolute
-    deviation of the labels from their mean in ``rmsd_prediction`` mode, and
-    ``all_labels.sum() / len(all_labels)`` otherwise. The meter tracks ``'loss'`` in
-    ``rmsd_prediction`` mode and ``'confidence_loss'``, ``'accuracy'``, ``'ROC AUC'``
-    otherwise. Relies on the module-level ``device`` and ``args``.
-    """
-    model.eval()
-    meter = AverageMeter(['loss'], unpooled_metrics=True) if rmsd_prediction else AverageMeter(['confidence_loss', 'accuracy', 'ROC AUC'], unpooled_metrics=True)
-    all_labels = []
-    for data in tqdm(loader, total=len(loader)):
-        try:
-            with torch.no_grad():
-                pred = model(data)
-            if rmsd_prediction:
-                labels = torch.cat([graph.rmsd for graph in data]).to(device) if isinstance(data, list) else data.rmsd
-                confidence_loss = F.mse_loss(pred, labels)
-                # Single-metric meter: add once here. The original fell through to the
-                # three-value ``meter.add`` below with ``roc_auc`` undefined (NameError).
-                meter.add([confidence_loss.cpu().detach()])
-            else:
-                # Accuracy stays 0 in the multi-cutoff branch, which does not compute it.
-                accuracy = torch.tensor(0.0, dtype=torch.float, device=pred[0].device)
-                if isinstance(args.rmsd_classification_cutoff, list):
-                    labels = torch.cat([graph.y_binned for graph in data]).to(device) if isinstance(data,list) else data.y_binned
-                    confidence_loss = F.cross_entropy(pred, labels)
-                else:
-                    labels = torch.cat([graph.y for graph in data]).to(device) if isinstance(data, list) else data.y
-                    confidence_loss = F.binary_cross_entropy_with_logits(pred, labels)
-                    accuracy = torch.mean((labels == (pred > 0).float()).float())
-                try:
-                    roc_auc = roc_auc_score(labels.detach().cpu().numpy(), pred.detach().cpu().numpy())
-                except ValueError as e:
-                    if 'Only one class present in y_true. ROC AUC score is not defined in that case.' in str(e):
-                        roc_auc = 0
-                    else:
-                        raise
-                meter.add([confidence_loss.cpu().detach(), accuracy.cpu().detach(), torch.tensor(roc_auc)])
-            all_labels.append(labels)
-        except RuntimeError as e:
-            if 'out of memory' in str(e):
-                print('| WARNING: ran out of memory, skipping batch')
-                for p in model.parameters():
-                    if p.grad is not None:
-                        del p.grad  # free some memory
-                torch.cuda.empty_cache()
-                continue
-            else:
-                raise
-    all_labels = torch.cat(all_labels)
-    if rmsd_prediction:
-        baseline_metric = ((all_labels - all_labels.mean()).abs()).mean()
-    else:
-        baseline_metric = all_labels.sum() / len(all_labels)
-    # NOTE(review): ``results`` is updated but a fresh ``meter.summary()`` is returned;
-    # kept as-is because whether summary() returns a shared dict is not visible here.
-    results = meter.summary()
-    results.update({'baseline_metric': baseline_metric})
-    return meter.summary(), baseline_metric
-def train(args, model, optimizer, scheduler, train_loader, val_loader, run_dir):
-    """Train for ``args.n_epochs`` epochs, validating and checkpointing after each one.
-
-    Writes ``best_model.pt`` whenever ``args.main_metric`` improves in the direction of
-    ``args.main_metric_goal``, optional periodic copies, and ``last_model.pt`` (model,
-    optimizer state and epoch) every epoch, all inside ``run_dir``. Relies on the
-    module-level ``device``.
-    """
-    best_val_metric = math.inf if args.main_metric_goal == 'min' else 0
-    best_epoch = 0
-    print("Starting training...")
-    for epoch in range(args.n_epochs):
-        logs = {}
-        train_metrics = train_epoch(model, train_loader, optimizer, args.rmsd_prediction)
-        print("Epoch {}: Training loss {:.4f}".format(epoch, train_metrics['confidence_loss']))
-        val_metrics, baseline_metric = test_epoch(model, val_loader, args.rmsd_prediction)
-        if args.rmsd_prediction:
-            # In this mode test_epoch's meter tracks 'loss', not 'confidence_loss'.
-            print("Epoch {}: Validation loss {:.4f}".format(epoch, val_metrics['loss']))
-        else:
-            print("Epoch {}: Validation loss {:.4f}  accuracy {:.4f}".format(epoch, val_metrics['confidence_loss'], val_metrics['accuracy']))
-        if args.wandb:
-            # No ``step=`` keyword on dict.update: it would insert a bogus 'step' entry.
-            logs.update({'valinf_' + k: v for k, v in val_metrics.items()})
-            logs.update({'train_' + k: v for k, v in train_metrics.items()})
-            logs.update({'mean_rmsd' if args.rmsd_prediction else 'fraction_positives': baseline_metric,
-                         'current_lr': optimizer.param_groups[0]['lr']})
-            wandb.log(logs, step=epoch + 1)
-        if scheduler:
-            scheduler.step(val_metrics[args.main_metric])
-        # NOTE(review): assumes the model exposes ``.module`` whenever the device is CUDA — confirm against get_model.
-        state_dict = model.module.state_dict() if device.type == 'cuda' else model.state_dict()
-        if args.main_metric_goal == 'min' and val_metrics[args.main_metric] < best_val_metric or \
-                args.main_metric_goal == 'max' and val_metrics[args.main_metric] > best_val_metric:
-            best_val_metric = val_metrics[args.main_metric]
-            best_epoch = epoch
-            torch.save(state_dict, os.path.join(run_dir, 'best_model.pt'))
-        if args.model_save_frequency > 0 and (epoch + 1) % args.model_save_frequency == 0:
-            torch.save(state_dict, os.path.join(run_dir, f'model_epoch{epoch+1}.pt'))
-        if args.best_model_save_frequency > 0 and (epoch + 1) % args.best_model_save_frequency == 0:
-            shutil.copyfile(os.path.join(run_dir, 'best_model.pt'), os.path.join(run_dir, f'best_model_epoch{epoch+1}.pt'))
-        torch.save({
-            'epoch': epoch,
-            'model': state_dict,
-            'optimizer': optimizer.state_dict(),
-        }, os.path.join(run_dir, 'last_model.pt'))
-    print("Best Validation accuracy {} on Epoch {}".format(best_val_metric, best_epoch))
-def construct_loader_confidence(args, device):
-    """Build the train and validation loaders of the confidence dataset.
-
-    If the train dataset fails because generated ligand positions for a cache id are
-    missing, the validation dataset is still constructed (which may create its cache)
-    before the failure is re-raised as an ``Exception``. Any other error propagates
-    immediately.
-
-    Returns:
-        ``(train_loader, val_loader)``
-    """
-    common_args = {'cache_path': args.cache_path, 'original_model_dir': args.original_model_dir, 'device': device,
-                   'inference_steps': args.inference_steps, 'samples_per_complex': args.samples_per_complex,
-                   'limit_complexes': args.limit_complexes, 'all_atoms': args.all_atoms, 'balance': args.balance,
-                   'rmsd_classification_cutoff': args.rmsd_classification_cutoff, 'use_original_model_cache': args.use_original_model_cache,
-                   'cache_creation_id': args.cache_creation_id, "cache_ids_to_combine": args.cache_ids_to_combine,
-                   "model_ckpt": args.ckpt}
-    loader_class = DataListLoader if torch.cuda.is_available() else DataLoader
-    # Python unbinds the ``as e`` name when the except block ends, so the exception is
-    # stored explicitly for the deferred raise below.
-    train_error = None
-    try:
-        train_dataset = ConfidenceDataset(split="train", args=args, **common_args)
-        train_loader = loader_class(dataset=train_dataset, batch_size=args.batch_size, shuffle=True)
-    except Exception as e:
-        if 'The generated ligand positions with cache_id do not exist:' in str(e):
-            print("HAPPENING | Encountered the following exception when loading the confidence train dataset:")
-            print(str(e))
-            print("HAPPENING | We are still continuing because we want to try to generate the validation dataset if it has not been created yet:")
-            train_error = e
-        else:
-            raise
-    val_dataset = ConfidenceDataset(split="val", args=args, **common_args)
-    val_loader = loader_class(dataset=val_dataset, batch_size=args.batch_size, shuffle=True)
-    if train_error is not None:
-        raise Exception('We encountered the exception during train dataset loading: ', train_error) from train_error
-    return train_loader, val_loader
-if __name__ == '__main__':
-    # Entry point: build loaders and model, optionally load weights, then train.
-    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-    with open(f'{args.original_model_dir}/model_parameters.yml') as f:
-        score_model_args = Namespace(**yaml.full_load(f))
-    # construct loader
-    train_loader, val_loader = construct_loader_confidence(args, device)
-    model = get_model(score_model_args if args.transfer_weights else args, device, t_to_sigma=None, confidence_mode=True)
-    optimizer, scheduler = get_optimizer_and_scheduler(args, model, scheduler_mode=args.main_metric_goal)
-    if args.transfer_weights:
-        print("HAPPENING | Transferring weights from original_model_dir to the new model after using original_model_dir's arguments to construct the new model.")
-        checkpoint = torch.load(os.path.join(args.original_model_dir,args.ckpt), map_location=device)
-        model_state_dict = model.state_dict()
-        # Copy only the entries whose names also exist in the new model.
-        transfer_weights_dict = {k: v for k, v in checkpoint.items() if k in model_state_dict}
-        model_state_dict.update(transfer_weights_dict)  # update the layers with the pretrained weights
-        model.load_state_dict(model_state_dict)
-    elif args.restart_dir:
-        # Named ``restart_state`` rather than ``dict`` to avoid shadowing the builtin.
-        restart_state = torch.load(f'{args.restart_dir}/last_model.pt', map_location=torch.device('cpu'))
-        model.module.load_state_dict(restart_state['model'], strict=True)
-        optimizer.load_state_dict(restart_state['optimizer'])
-        print("Restarting from epoch", restart_state['epoch'])
-    numel = sum(p.numel() for p in model.parameters())
-    print('Model with', numel, 'parameters')
-    if args.wandb:
-        wandb.init(
-            entity='entity',
-            settings=wandb.Settings(start_method="fork"),
-            project=args.project,
-            name=args.run_name,
-            config=args
-        )
-        wandb.log({'numel': numel})
-    # record parameters
-    run_dir = os.path.join(args.log_dir, args.run_name)
-    yaml_file_name = os.path.join(run_dir, 'model_parameters.yml')
-    save_yaml_file(yaml_file_name, args.__dict__)
-    # Set after the YAML dump so the torch.device object is not part of the saved parameters.
-    args.device = device
-    train(args, model, optimizer, scheduler, train_loader, val_loader, run_dir)

confidence/dataset.py DELETED Viewed

@@ -1,276 +0,0 @@
-import itertools
-import math
-import os
-import pickle
-import random
-from argparse import Namespace
-from functools import partial
-import copy
-import numpy as np
-import pandas as pd
-import torch
-import yaml
-from torch_geometric.data import Dataset, Data
-from torch_geometric.loader import DataLoader
-from tqdm import tqdm
-from datasets.pdbbind import PDBBind
-from utils.diffusion_utils import get_t_schedule
-from utils.sampling import randomize_position, sampling
-from utils.utils import get_model
-from utils.diffusion_utils import t_to_sigma as t_to_sigma_compl
-class ListDataset(Dataset):
-    """Dataset that serves items from an in-memory sequence handed to the constructor."""
-    def __init__(self, list):
-        # The parameter name shadows the builtin; it is kept so keyword callers still work.
-        super().__init__()
-        items = list
-        self.data_list = items
-    def len(self) -> int:
-        """Return the number of stored items."""
-        count = len(self.data_list)
-        return count
-    def get(self, idx: int) -> Data:
-        """Return the item stored at position ``idx``."""
-        item = self.data_list[idx]
-        return item
-def get_cache_path(args, split):
-    """Build the cache directory path that encodes the dataset settings in ``args``.
-
-    The parent folder is ``args.cache_path`` plus optional ``_torsion`` / ``_allatoms``
-    suffixes. The leaf folder name lists the complex limit, the split file stem, the
-    ligand size limit, the hydrogen flag and the receptor graph settings, followed by
-    optional atom-graph, conformer-count and ESM-embedding markers. ``split`` selects
-    ``args.split_train`` when it equals ``'train'`` and ``args.split_val`` otherwise.
-    """
-    root = args.cache_path
-    if not args.no_torsion:
-        root += '_torsion'
-    if args.all_atoms:
-        root += '_allatoms'
-    split_file = args.split_val if split != 'train' else args.split_train
-    index_name = os.path.splitext(os.path.basename(split_file))[0]
-    parts = [f'limit{args.limit_complexes}_INDEX{index_name}_maxLigSize{args.max_lig_size}_H{int(not args.remove_hs)}_recRad{args.receptor_radius}_recMax{args.c_alpha_max_neighbors}']
-    if args.all_atoms:
-        parts.append(f'_atomRad{args.atom_radius}_atomMax{args.atom_max_neighbors}')
-    # The conformer count is only recorded when torsions are used and it differs from 1.
-    if not (args.no_torsion or args.num_conformers == 1):
-        parts.append(f'_confs{args.num_conformers}')
-    if args.esm_embeddings_path is not None:
-        parts.append('_esmEmbeddings')
-    return os.path.join(root, ''.join(parts))
-def get_args_and_cache_path(original_model_dir, split):
-    """Load ``model_parameters.yml`` from ``original_model_dir`` and derive its cache path.
-
-    Returns:
-        ``(model_args, cache_path)`` where ``model_args`` is a ``Namespace`` built from
-        the YAML file and ``cache_path`` is ``get_cache_path(model_args, split)``.
-    """
-    params_file = f'{original_model_dir}/model_parameters.yml'
-    with open(params_file) as handle:
-        model_args = Namespace(**yaml.full_load(handle))
-    cache_path = get_cache_path(model_args, split)
-    return model_args, cache_path
-class ConfidenceDataset(Dataset):
-    def __init__(self, cache_path, original_model_dir, split, device, limit_complexes,
-                 inference_steps, samples_per_complex, all_atoms,
-                 args, model_ckpt, balance=False, use_original_model_cache=True, rmsd_classification_cutoff=2,
-                 cache_ids_to_combine=None, cache_creation_id=None):
-        """Load the complex graphs and the sampled ligand poses/RMSDs for one split.
-
-        If the pose cache for this model/split is missing, ``self.preprocessing`` is run
-        first to create it. If the complex-graph cache is missing, a ``PDBBind`` dataset
-        is instantiated first. With ``cache_ids_to_combine`` set, the poses and RMSDs of
-        the listed cache ids are concatenated per complex.
-
-        Raises:
-            Exception: if ``ligand_positions_id<ID>.pkl`` is missing for one of
-                ``cache_ids_to_combine`` (the message is matched by the training script).
-        """
-        super().__init__()
-        self.device = device
-        self.inference_steps = inference_steps
-        self.limit_complexes = limit_complexes
-        self.all_atoms = all_atoms
-        self.original_model_dir = original_model_dir
-        self.balance = balance
-        self.use_original_model_cache = use_original_model_cache
-        self.rmsd_classification_cutoff = rmsd_classification_cutoff
-        self.cache_ids_to_combine = cache_ids_to_combine
-        self.cache_creation_id = cache_creation_id
-        self.samples_per_complex = samples_per_complex
-        self.model_ckpt = model_ckpt
-        self.original_model_args, original_model_cache = get_args_and_cache_path(original_model_dir, split)
-        self.complex_graphs_cache = original_model_cache if self.use_original_model_cache else get_cache_path(args, split)
-        # check if the docked positions have already been computed, if not run the preprocessing (docking every complex)
-        self.full_cache_path = os.path.join(cache_path, f'model_{os.path.splitext(os.path.basename(original_model_dir))[0]}'
-                                            f'_split_{split}_limit_{limit_complexes}')
-        expected_positions = "ligand_positions.pkl" if self.cache_creation_id is None else f"ligand_positions_id{self.cache_creation_id}.pkl"
-        if not os.path.exists(os.path.join(self.full_cache_path, expected_positions)):
-            os.makedirs(self.full_cache_path, exist_ok=True)
-            self.preprocessing(original_model_cache)
-        # load the graphs that the confidence model will use
-        print('Using the cached complex graphs of the original model args' if self.use_original_model_cache else 'Not using the cached complex graphs of the original model args. Instead the complex graphs are used that are at the location given by the dataset parameters given to confidence_train.py')
-        print(self.complex_graphs_cache)
-        graphs_file = os.path.join(self.complex_graphs_cache, "heterographs.pkl")
-        if not os.path.exists(graphs_file):
-            print(f'HAPPENING | Complex graphs path does not exist yet: {graphs_file}. For that reason, we are now creating the dataset.')
-            # Instantiated only for its side effect; the returned object is discarded.
-            PDBBind(transform=None, root=args.data_dir, limit_complexes=args.limit_complexes,
-                    receptor_radius=args.receptor_radius,
-                    cache_path=args.cache_path, split_path=args.split_val if split == 'val' else args.split_train,
-                    remove_hs=args.remove_hs, max_lig_size=None,
-                    c_alpha_max_neighbors=args.c_alpha_max_neighbors,
-                    matching=not args.no_torsion, keep_original=True,
-                    popsize=args.matching_popsize,
-                    maxiter=args.matching_maxiter,
-                    all_atoms=args.all_atoms,
-                    atom_radius=args.atom_radius,
-                    atom_max_neighbors=args.atom_max_neighbors,
-                    esm_embeddings_path=args.esm_embeddings_path,
-                    require_ligand=True)
-        print(f'HAPPENING | Loading complex graphs from: {graphs_file}')
-        # NOTE(review): pickle.load executes arbitrary code; only load caches from trusted locations.
-        with open(graphs_file, 'rb') as f:
-            complex_graphs = pickle.load(f)
-        self.complex_graph_dict = {d.name: d for d in complex_graphs}
-        if self.cache_ids_to_combine is None:
-            print(f'HAPPENING | Loading positions and rmsds from: {os.path.join(self.full_cache_path, "ligand_positions.pkl")}')
-            with open(os.path.join(self.full_cache_path, "ligand_positions.pkl"), 'rb') as f:
-                self.full_ligand_positions, self.rmsds = pickle.load(f)
-            names_file = os.path.join(self.full_cache_path, "complex_names_in_same_order.pkl")
-            if os.path.exists(names_file):
-                with open(names_file, 'rb') as f:
-                    generated_rmsd_complex_names = pickle.load(f)
-            else:
-                print('HAPPENING | The path, ', names_file,
-                      ' does not exist. \n => We assume that means that we are using a ligand_positions.pkl where the '
-                      'code was not saving the complex names for them yet. We now instead use the complex names of '
-                      'the dataset that the original model used to create the ligand positions and RMSDs.')
-                with open(os.path.join(original_model_cache, "heterographs.pkl"), 'rb') as f:
-                    original_model_complex_graphs = pickle.load(f)
-                    generated_rmsd_complex_names = [d.name for d in original_model_complex_graphs]
-            assert (len(self.rmsds) == len(generated_rmsd_complex_names))
-        else:
-            per_cache_rmsds, per_cache_positions, per_cache_names = [], [], []
-            for cache_id in self.cache_ids_to_combine:
-                positions_file = os.path.join(self.full_cache_path, f"ligand_positions_id{cache_id}.pkl")
-                # The message now shows the file that is actually opened (``_id<ID>.pkl``).
-                print(f'HAPPENING | Loading positions and rmsds from cache_id from the path: {positions_file}')
-                if not os.path.exists(positions_file):
-                    # be careful with changing this error message since it is sometimes caught in a try/except
-                    raise Exception(f'The generated ligand positions with cache_id do not exist: {cache_id}')
-                with open(positions_file, 'rb') as f:
-                    full_ligand_positions, rmsds = pickle.load(f)
-                with open(os.path.join(self.full_cache_path, f"complex_names_in_same_order_id{cache_id}.pkl"), 'rb') as f:
-                    names_unsorted = pickle.load(f)
-                per_cache_names.append(names_unsorted)
-                per_cache_rmsds.append(rmsds)
-                per_cache_positions.append(full_ligand_positions)
-            # One common name order so that the caches can be concatenated complex by complex.
-            names_order = list(set(itertools.chain.from_iterable(per_cache_names)))
-            all_rmsds, all_full_ligand_positions = [], []
-            for names_unsorted, positions_unsorted, rmsds_unsorted in zip(per_cache_names, per_cache_positions, per_cache_rmsds):
-                name_to_sample = {name: (pos, rmsd) for name, pos, rmsd in zip(names_unsorted, positions_unsorted, rmsds_unsorted)}
-                # NOTE(review): names_order is the union over all caches, so a complex
-                # missing from one cache raises KeyError here.
-                all_rmsds.append([name_to_sample[name][1] for name in names_order])
-                all_full_ligand_positions.append([name_to_sample[name][0] for name in names_order])
-            self.full_ligand_positions = [np.concatenate(positions_tuple, axis=0) for positions_tuple in zip(*all_full_ligand_positions)]
-            self.rmsds = [np.concatenate(rmsds_tuple, axis=0) for rmsds_tuple in zip(*all_rmsds)]
-            generated_rmsd_complex_names = names_order
-        print('Number of complex graphs: ', len(self.complex_graph_dict))
-        print('Number of RMSDs and positions for the complex graphs: ', len(self.full_ligand_positions))
-        self.all_samples_per_complex = samples_per_complex * (1 if self.cache_ids_to_combine is None else len(self.cache_ids_to_combine))
-        self.positions_rmsds_dict = {name: (pos, rmsd) for name, pos, rmsd in zip (generated_rmsd_complex_names, self.full_ligand_positions, self.rmsds)}
-        # Only complexes that have both a graph and generated poses are used.
-        self.dataset_names = list(set(self.positions_rmsds_dict.keys()) & set(self.complex_graph_dict.keys()))
-        if limit_complexes > 0:
-            self.dataset_names = self.dataset_names[:limit_complexes]
-    def len(self):
-        """Return how many complexes the dataset serves."""
-        names = self.dataset_names
-        return len(names)
-    def get(self, idx):
-        """Return a copy of the ``idx``-th complex graph with one sampled ligand pose and its labels.
-
-        With ``self.balance`` the label is drawn first and a pose of the matching class is
-        picked when one exists. Otherwise a pose is drawn uniformly and labelled by
-        comparing its RMSD with ``self.rmsd_classification_cutoff``. All diffusion times
-        (``node_t`` / ``complex_t``) are set to zero.
-        """
-        name = self.dataset_names[idx]
-        complex_graph = copy.deepcopy(self.complex_graph_dict[name])
-        positions, rmsds = self.positions_rmsds_dict[name]
-        cutoff = self.rmsd_classification_cutoff
-        if self.balance:
-            if isinstance(cutoff, list):
-                raise ValueError("a list for --rmsd_classification_cutoff can only be used without --balance")
-            label = random.randint(0, 1)
-            success = rmsds < cutoff
-            n_success = np.count_nonzero(success)
-            want_negative = label == 0 and n_success != self.all_samples_per_complex
-            if want_negative:
-                # draw one of the poses that missed the cutoff
-                sample = random.randint(0, self.all_samples_per_complex - n_success - 1)
-                complex_graph['ligand'].pos = torch.from_numpy(positions[~success][sample])
-            elif n_success > 0:
-                # draw one of the successful poses; without any, the stored pose is kept
-                sample = random.randint(0, n_success - 1)
-                complex_graph['ligand'].pos = torch.from_numpy(positions[success][sample])
-            complex_graph.y = torch.tensor(label).float()
-        else:
-            sample = random.randint(0, self.all_samples_per_complex - 1)
-            sample_rmsd = rmsds[sample]
-            complex_graph['ligand'].pos = torch.from_numpy(positions[sample])
-            complex_graph.y = torch.tensor(sample_rmsd < cutoff).float().unsqueeze(0)
-            if isinstance(cutoff, list):
-                # one indicator per interval [0, c0), [c0, c1), ..., [c_last, inf)
-                in_bin = np.logical_and(sample_rmsd < cutoff + [math.inf], sample_rmsd >= [0] + cutoff)
-                complex_graph.y_binned = torch.tensor(in_bin, dtype=torch.float).unsqueeze(0)
-                complex_graph.y = torch.tensor(sample_rmsd < cutoff[0]).unsqueeze(0).float()
-            complex_graph.rmsd = torch.tensor(sample_rmsd).unsqueeze(0).float()
-        time_keys = ('tr', 'rot', 'tor')
-        node_types = ['ligand', 'receptor']
-        if self.all_atoms:
-            node_types.append('atom')
-        for node_type in node_types:
-            store = complex_graph[node_type]
-            store.node_t = {key: 0 * torch.ones(store.num_nodes) for key in time_keys}
-        complex_graph.complex_t = {key: 0 * torch.ones(1) for key in time_keys}
-        return complex_graph
-    def preprocessing(self, original_model_cache):
-        t_to_sigma = partial(t_to_sigma_compl, args=self.original_model_args)
-        model = get_model(self.original_model_args, self.device, t_to_sigma=t_to_sigma, no_parallel=True)
-        state_dict = torch.load(f'{self.original_model_dir}/{self.model_ckpt}', map_location=torch.device('cpu'))
-        model.load_state_dict(state_dict, strict=True)
-        model = model.to(self.device)
-        model.eval()
-        tr_schedule = get_t_schedule(inference_steps=self.inference_steps)
-        rot_schedule = tr_schedule
-        tor_schedule = tr_schedule
-        print('common t schedule', tr_schedule)
-        print('HAPPENING | loading cached complexes of the original model to create the confidence dataset RMSDs and predicted positions. Doing that from: ', os.path.join(self.complex_graphs_cache, "heterographs.pkl"))
-        with open(os.path.join(original_model_cache, "heterographs.pkl"), 'rb') as f:
-            complex_graphs = pickle.load(f)
-        dataset = ListDataset(complex_graphs)
-        loader = DataLoader(dataset=dataset, batch_size=1, shuffle=False)
-        rmsds, full_ligand_positions, names = [], [], []
-        for idx, orig_complex_graph in tqdm(enumerate(loader)):
-            data_list = [copy.deepcopy(orig_complex_graph) for _ in range(self.samples_per_complex)]
-            randomize_position(data_list, self.original_model_args.no_torsion, False, self.original_model_args.tr_sigma_max)
-            predictions_list = None
-            failed_convergence_counter = 0
-            while predictions_list is None:
-                try:
-                    predictions_list, confidences = sampling(data_list=data_list, model=model, inference_steps=self.inference_steps,
-                                                             tr_schedule=tr_schedule, rot_schedule=rot_schedule, tor_schedule=tor_schedule,
-                                                             device=self.device, t_to_sigma=t_to_sigma, model_args=self.original_model_args)
-                except Exception as e:
-                    if 'failed to converge' in str(e):
-                        failed_convergence_counter += 1
-                        if failed_convergence_counter > 5:
-                            print('| WARNING: SVD failed to converge 5 times - skipping the complex')
-                            break
-                        print('| WARNING: SVD failed to converge - trying again with a new sample')
-                    else:
-                        raise e
-            if failed_convergence_counter > 5: predictions_list = data_list
-            if self.original_model_args.no_torsion:
-                orig_complex_graph['ligand'].orig_pos = (orig_complex_graph['ligand'].pos.cpu().numpy() + orig_complex_graph.original_center.cpu().numpy())
-            filterHs = torch.not_equal(predictions_list[0]['ligand'].x[:, 0], 0).cpu().numpy()
-            if isinstance(orig_complex_graph['ligand'].orig_pos, list):
-                orig_complex_graph['ligand'].orig_pos = orig_complex_graph['ligand'].orig_pos[0]
-            ligand_pos = np.asarray([complex_graph['ligand'].pos.cpu().numpy()[filterHs] for complex_graph in predictions_list])
-            orig_ligand_pos = np.expand_dims(orig_complex_graph['ligand'].orig_pos[filterHs] - orig_complex_graph.original_center.cpu().numpy(), axis=0)
-            rmsd = np.sqrt(((ligand_pos - orig_ligand_pos) ** 2).sum(axis=2).mean(axis=1))
-            rmsds.append(rmsd)
-            full_ligand_positions.append(np.asarray([complex_graph['ligand'].pos.cpu().numpy() for complex_graph in predictions_list]))
-            names.append(orig_complex_graph.name[0])
-            assert(len(orig_complex_graph.name) == 1) # I just put this assert here because of the above line where I assumed that the list is always only lenght 1. Just in case it isn't maybe check what the names in there are.
-        with open(os.path.join(self.full_cache_path, f"ligand_positions{'' if self.cache_creation_id is None else '_id' + str(self.cache_creation_id)}.pkl"), 'wb') as f:
-            pickle.dump((full_ligand_positions, rmsds), f)
-        with open(os.path.join(self.full_cache_path, f"complex_names_in_same_order{'' if self.cache_creation_id is None else '_id' + str(self.cache_creation_id)}.pkl"), 'wb') as f:
-            pickle.dump((names), f)

{confidence → datasets}/__init__.py RENAMED Viewed

File without changes

datasets/conformer_matching.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import copy, time
+import numpy as np
+from collections import defaultdict
+from rdkit import Chem, RDLogger
+from rdkit.Chem import AllChem, rdMolTransforms
+from rdkit import Geometry
+import networkx as nx
+from scipy.optimize import differential_evolution
+RDLogger.DisableLog('rdApp.*')
+"""
+    Conformer matching routines from Torsional Diffusion
+"""
+def GetDihedral(conf, atom_idx):
+    return rdMolTransforms.GetDihedralRad(conf, atom_idx[0], atom_idx[1], atom_idx[2], atom_idx[3])
+def SetDihedral(conf, atom_idx, new_vale):
+    rdMolTransforms.SetDihedralRad(conf, atom_idx[0], atom_idx[1], atom_idx[2], atom_idx[3], new_vale)
+def apply_changes(mol, values, rotatable_bonds, conf_id):
+    opt_mol = copy.copy(mol)
+    [SetDihedral(opt_mol.GetConformer(conf_id), rotatable_bonds[r], values[r]) for r in range(len(rotatable_bonds))]
+    return opt_mol
+def optimize_rotatable_bonds(mol, true_mol, rotatable_bonds, probe_id=-1, ref_id=-1, seed=0, popsize=15, maxiter=500,
+                             mutation=(0.5, 1), recombination=0.8):
+    opt = OptimizeConformer(mol, true_mol, rotatable_bonds, seed=seed, probe_id=probe_id, ref_id=ref_id)
+    max_bound = [np.pi] * len(opt.rotatable_bonds)
+    min_bound = [-np.pi] * len(opt.rotatable_bonds)
+    bounds = (min_bound, max_bound)
+    bounds = list(zip(bounds[0], bounds[1]))
+    # Optimize conformations
+    result = differential_evolution(opt.score_conformation, bounds,
+                                    maxiter=maxiter, popsize=popsize,
+                                    mutation=mutation, recombination=recombination, disp=False, seed=seed)
+    opt_mol = apply_changes(opt.mol, result['x'], opt.rotatable_bonds, conf_id=probe_id)
+    return opt_mol
+class OptimizeConformer:
+    def __init__(self, mol, true_mol, rotatable_bonds, probe_id=-1, ref_id=-1, seed=None):
+        super(OptimizeConformer, self).__init__()
+        if seed:
+            np.random.seed(seed)
+        self.rotatable_bonds = rotatable_bonds
+        self.mol = mol
+        self.true_mol = true_mol
+        self.probe_id = probe_id
+        self.ref_id = ref_id
+    def score_conformation(self, values):
+        for i, r in enumerate(self.rotatable_bonds):
+            SetDihedral(self.mol.GetConformer(self.probe_id), r, values[i])
+        return AllChem.AlignMol(self.mol, self.true_mol, self.probe_id, self.ref_id)
+def get_torsion_angles(mol):
+    torsions_list = []
+    G = nx.Graph()
+    for i, atom in enumerate(mol.GetAtoms()):
+        G.add_node(i)
+    nodes = set(G.nodes())
+    for bond in mol.GetBonds():
+        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+        G.add_edge(start, end)
+    for e in G.edges():
+        G2 = copy.deepcopy(G)
+        G2.remove_edge(*e)
+        if nx.is_connected(G2): continue
+        l = list(sorted(nx.connected_components(G2), key=len)[0])
+        if len(l) < 2: continue
+        n0 = list(G2.neighbors(e[0]))
+        n1 = list(G2.neighbors(e[1]))
+        torsions_list.append(
+            (n0[0], e[0], e[1], n1[0])
+        )
+    return torsions_list

datasets/constants.py ADDED Viewed

	@@ -0,0 +1,179 @@

+# Significant contribution from Ben Fry and Nick Polizzi
+# Three-letter residue name -> one-letter code.
+three_to_one = {'ALA':	'A',
+                'ARG':	'R',
+                'ASN':	'N',
+                'ASP':	'D',
+                'CYS':	'C',
+                'GLN':	'Q',
+                'GLU':	'E',
+                'GLY':	'G',
+                'HIS':	'H',
+                'ILE':	'I',
+                'LEU':	'L',
+                'LYS':	'K',
+                'MET':	'M',
+                'MSE':  'M', # MSE is almost the same amino acid as MET; the sulfur is just replaced by selenium
+                'PHE':	'F',
+                'PRO':	'P',
+                'PYL':	'O',
+                'SER':	'S',
+                'SEC':	'U',
+                'THR':	'T',
+                'TRP':	'W',
+                'TYR':	'Y',
+                'VAL':	'V',
+                'ASX':	'B',
+                'GLX':	'Z',
+                'XAA':	'X',
+                'XLE':	'J'}
+# Three-letter residue name -> integer index (0-19); MSE shares index 12 with MET.
+aa_name2aa_idx = {'ALA': 0, 'ARG': 1, 'ASN': 2, 'ASP': 3, 'CYS': 4, 'GLU': 5, 'GLN': 6, 'GLY': 7,
+                  'HIS': 8, 'ILE': 9, 'LEU': 10, 'LYS': 11, 'MET': 12, 'PHE': 13, 'PRO': 14,
+                  'SER': 15, 'THR': 16, 'TRP': 17, 'TYR': 18, 'VAL': 19, 'MSE': 12}
+# One-letter code -> three-letter name for the 20 residues listed here.
+aa_short2long = {'C': 'CYS', 'D': 'ASP', 'S': 'SER', 'Q': 'GLN', 'K': 'LYS', 'I': 'ILE',
+                 'P': 'PRO', 'T': 'THR', 'F': 'PHE', 'N': 'ASN', 'G': 'GLY', 'H': 'HIS',
+                 'L': 'LEU', 'R': 'ARG', 'W': 'TRP', 'A': 'ALA', 'V': 'VAL', 'E': 'GLU',
+                 'Y': 'TYR', 'M': 'MET'}
+# Derived lookups: one-letter code -> index, index -> one-letter code, three-letter -> one-letter.
+aa_short2aa_idx = {aa_short: aa_name2aa_idx[aa_long] for aa_short, aa_long in aa_short2long.items()}
+aa_idx2aa_short = {aa_idx: aa_short for aa_short, aa_idx in aa_short2aa_idx.items()}
+aa_long2short = {aa_long: aa_short for aa_short, aa_long in aa_short2long.items()}
+# MSE is not a value of aa_short2long, so it is added to the reverse map by hand.
+aa_long2short['MSE'] = 'M'
+# One-letter residue code -> {chi index (1-based): tuple of the four atom names listed for that chi angle}.
+# Residues with no entry here (e.g. 'A', 'G') have no chi definitions in this table.
+chi = { 'C' :
+        { 1: ('N'  , 'CA' , 'CB' , 'SG' )   },
+        'D' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'OD1'), },
+        'E' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD' ),
+          3: ('CB' , 'CG' , 'CD' , 'OE1'), },
+        'F' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD1'), },
+        'H' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'ND1'), },
+        'I' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG1'),
+          2: ('CA' , 'CB' , 'CG1', 'CD1'), },
+        'K' :
+        { 1: ('N'  , 'CA' , 'CB'  ,'CG' ),
+          2: ('CA' , 'CB' , 'CG'  ,'CD' ),
+          3: ('CB' , 'CG' , 'CD'  ,'CE' ),
+          4: ('CG' , 'CD' , 'CE'  ,'NZ' ), },
+        'L' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD1'), },
+        'M' :
+        { 1: ('N'  , 'CA' , 'CB'  ,'CG' ),
+          2: ('CA' , 'CB' , 'CG'  ,'SD' ),
+          3: ('CB' , 'CG' , 'SD'  ,'CE' ), },
+        'N' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'OD1'), },
+        'P' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD' ), },
+        'Q' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD' ),
+          3: ('CB' , 'CG' , 'CD' , 'OE1'), },
+        'R' :
+        { 1: ('N'  , 'CA' , 'CB'  ,'CG' ),
+          2: ('CA' , 'CB' , 'CG'  ,'CD' ),
+          3: ('CB' , 'CG' , 'CD'  ,'NE' ),
+          4: ('CG' , 'CD' , 'NE'  ,'CZ' ), },
+        'S' :
+        { 1: ('N'  , 'CA' , 'CB' , 'OG' ), },
+        'T' :
+        { 1: ('N'  , 'CA' , 'CB' , 'OG1'), },
+        'V' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG1'), },
+        'W' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD1'), },
+        'Y' :
+        { 1: ('N'  , 'CA' , 'CB' , 'CG' ),
+          2: ('CA' , 'CB' , 'CG' , 'CD1'), },
+        }
+# One-letter residue code -> ordered list of atom names; every list starts with N, CA, C, O.
+# aa_to_cg_indices below indexes into these lists, so the order of each list matters.
+atom_order = {'G': ['N', 'CA', 'C', 'O'],
+'A': ['N', 'CA', 'C', 'O', 'CB'],
+'S': ['N', 'CA', 'C', 'O', 'CB', 'OG'],
+'C': ['N', 'CA', 'C', 'O', 'CB', 'SG'],
+'T': ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2'],
+'P': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD'],
+'V': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2'],
+'M': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE'],
+'N': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2'],
+'I': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1'],
+'L': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2'],
+'D': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2'],
+'E': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2'],
+'K': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ'],
+'Q': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2'],
+'H': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2'],
+'F': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ'],
+'R': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2'],
+'Y': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH'],
+'W': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'NE1', 'CZ2', 'CZ3', 'CH2'],
+'X': ['N', 'CA', 'C', 'O']}     # unknown amino acid
+# Three-letter residue name -> SMILES string.
+# NOTE(review): each string appears to embed the residue between two glycine-like
+# caps (NCC(=O)... / ...NCC(=O)O) - confirm against the code that consumes this table.
+amino_acid_smiles = {
+    'PHE': '[NH3+]CC(=O)N[C@@H](Cc1ccccc1)C(=O)NCC(=O)O',
+    'MET': 'CSCC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'TYR': '[NH3+]CC(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)NCC(=O)O',
+    'ILE': 'CC[C@H](C)[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'TRP': '[NH3+]CC(=O)N[C@@H](Cc1c[nH]c2ccccc12)C(=O)NCC(=O)O',
+    'THR': 'C[C@@H](O)[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'CYS': '[NH3+]CC(=O)N[C@@H](CS)C(=O)NCC(=O)O',
+    'ALA': 'C[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'LYS': '[NH3+]CCCC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'PRO': '[NH3+]CC(=O)N1CCC[C@H]1C(=O)NCC(=O)O',
+    'LEU': 'CC(C)C[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'GLY': '[NH3+]CC(=O)NCC(=O)NCC(=O)O',
+    'ASP': '[NH3+]CC(=O)N[C@@H](CC(=O)O)C(=O)NCC(=O)O',
+    'HIS': '[NH3+]CC(=O)N[C@@H](Cc1c[nH]c[nH+]1)C(=O)NCC(=O)O',
+    'VAL': 'CC(C)[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'SER': '[NH3+]CC(=O)N[C@@H](CO)C(=O)NCC(=O)O',
+    'ARG': 'NC(=[NH2+])NCCC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'GLU': '[NH3+]CC(=O)N[C@@H](CCC(=O)O)C(=O)NCC(=O)O',
+    'GLN': 'NC(=O)CC[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+    'ASN': 'NC(=O)C[C@H](NC(=O)C[NH3+])C(=O)NCC(=O)O',
+ }
+# Three-letter residue name -> {integer index: atom name}.
+# NOTE(review): the indices presumably refer to RDKit atom indices of the matching
+# amino_acid_smiles entry - confirm against the caller.
+cg_rdkit_indices = {
+    'PHE': {4: 'N', 5: 'CA', 13: 'C', 14: 'O', 6: 'CB', 7: 'CG', 8: 'CD1', 12: 'CD2', 9: 'CE1', 11: 'CE2', 10: 'CZ'},
+    'MET': {5: 'N', 4: 'CA', 10: 'C', 11: 'O', 3: 'CB', 2: 'CG', 1: 'SD', 0: 'CE'},
+    'TYR': {4: 'N', 5: 'CA', 14: 'C', 15: 'O', 6: 'CB', 7: 'CG', 8: 'CD1', 13: 'CD2', 9: 'CE1', 12: 'CE2', 10: 'CZ', 11: 'OH'},
+    'ILE': {5: 'N', 4: 'CA', 10: 'C', 11: 'O', 2: 'CB', 1: 'CG1', 3: 'CG2', 0: 'CD1'},
+    'TRP': {4: 'N', 5: 'CA', 16: 'C', 17: 'O', 6: 'CB', 7: 'CG', 8: 'CD1', 15: 'CD2', 9: 'NE1', 10: 'CE2', 14: 'CE3', 11: 'CZ2', 13: 'CZ3', 12: 'CH2'},
+    'THR': {4: 'N', 3: 'CA', 9: 'C', 10: 'O', 1: 'CB', 2: 'OG1', 0: 'CG2'},
+    'CYS': {4: 'N', 5: 'CA', 8: 'C', 9: 'O', 6: 'CB', 7: 'SG'},
+    'ALA': {2: 'N', 1: 'CA', 7: 'C', 8: 'O', 0: 'CB'},
+    'LYS': {6: 'N', 5: 'CA', 11: 'C', 12: 'O', 4: 'CB', 3: 'CG', 2: 'CD', 1: 'CE', 0: 'NZ'},
+    'PRO': {4: 'N', 8: 'CA', 9: 'C', 10: 'O', 7: 'CB', 6: 'CG', 5: 'CD'},
+    'LEU': {5: 'N', 4: 'CA', 10: 'C', 11: 'O', 3: 'CB', 1: 'CG', 0: 'CD1', 2: 'CD2'},
+    'GLY': {4: 'N', 5: 'CA', 6: 'C', 7: 'O'},
+    'ASP': {4: 'N', 5: 'CA', 10: 'C', 11: 'O', 6: 'CB', 7: 'CG', 8: 'OD1', 9: 'OD2'},
+    'HIS': {4: 'N', 5: 'CA', 12: 'C', 13: 'O', 6: 'CB', 7: 'CG', 11: 'ND1', 8: 'CD2', 10: 'CE1', 9: 'NE2'},
+    'VAL': {4: 'N', 3: 'CA', 9: 'C', 10: 'O', 1: 'CB', 0: 'CG1', 2: 'CG2'},
+    'SER': {4: 'N', 5: 'CA', 8: 'C', 9: 'O', 6: 'CB', 7: 'OG'},
+    'ARG': {8: 'N', 7: 'CA', 13: 'C', 14: 'O', 6: 'CB', 5: 'CG', 4: 'CD', 3: 'NE', 1: 'CZ', 0: 'NH1', 2: 'NH2'},
+    'GLU': {4: 'N', 5: 'CA', 11: 'C', 12: 'O', 6: 'CB', 7: 'CG', 8: 'CD', 9: 'OE1', 10: 'OE2'},
+    'GLN': {6: 'N', 5: 'CA', 11: 'C', 12: 'O', 4: 'CB', 3: 'CG', 1: 'CD', 2: 'OE1', 0: 'NE2'},
+    'ASN': {5: 'N', 4: 'CA', 10: 'C', 11: 'O', 3: 'CB', 1: 'CG', 2: 'OD1', 0: 'ND2'}
+}
+# One-letter residue code -> for each atom name in the cg_rdkit_indices entry (in dict
+# order), its position within that residue's atom_order list.
+aa_to_cg_indices = {aa_long2short[x]: [atom_order[aa_long2short[x]].index(aname) for aname in index_dict.values()]  for x, index_dict in cg_rdkit_indices.items()}

datasets/dataloader.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from collections.abc import Mapping, Sequence
+from typing import List, Optional, Union
+import torch.utils.data
+from torch.utils.data.dataloader import default_collate
+from torch_geometric.data import Batch, Dataset
+from torch_geometric.data.data import BaseData
+class Collater:
+    def __init__(self, follow_batch, exclude_keys):
+        self.follow_batch = follow_batch
+        self.exclude_keys = exclude_keys
+    def __call__(self, batch):
+        batch = [x for x in batch if x is not None]
+        elem = batch[0]
+        if isinstance(elem, BaseData):
+            return Batch.from_data_list(batch, self.follow_batch,
+                                        self.exclude_keys)
+        elif isinstance(elem, torch.Tensor):
+            return default_collate(batch)
+        elif isinstance(elem, float):
+            return torch.tensor(batch, dtype=torch.float)
+        elif isinstance(elem, int):
+            return torch.tensor(batch)
+        elif isinstance(elem, str):
+            return batch
+        elif isinstance(elem, Mapping):
+            return {key: self([data[key] for data in batch]) for key in elem}
+        elif isinstance(elem, tuple) and hasattr(elem, '_fields'):
+            return type(elem)(*(self(s) for s in zip(*batch)))
+        elif isinstance(elem, Sequence) and not isinstance(elem, str):
+            return [self(s) for s in zip(*batch)]
+        raise TypeError(f'DataLoader found invalid type: {type(elem)}')
+    def collate(self, batch):  # Deprecated...
+        return self(batch)
+class DataLoader(torch.utils.data.DataLoader):
+    r"""A data loader which merges data objects from a
+    :class:`torch_geometric.data.Dataset` to a mini-batch.
+    Data objects can be either of type :class:`~torch_geometric.data.Data` or
+    :class:`~torch_geometric.data.HeteroData`.
+    Args:
+        dataset (Dataset): The dataset from which to load the data.
+        batch_size (int, optional): How many samples per batch to load.
+            (default: :obj:`1`)
+        shuffle (bool, optional): If set to :obj:`True`, the data will be
+            reshuffled at every epoch. (default: :obj:`False`)
+        follow_batch (List[str], optional): Creates assignment batch
+            vectors for each key in the list. (default: :obj:`None`)
+        exclude_keys (List[str], optional): Will exclude each key in the
+            list. (default: :obj:`None`)
+        **kwargs (optional): Additional arguments of
+            :class:`torch.utils.data.DataLoader`.
+    """
+    def __init__(
+        self,
+        dataset: Union[Dataset, List[BaseData]],
+        batch_size: int = 1,
+        shuffle: bool = False,
+        follow_batch: Optional[List[str]] = None,
+        exclude_keys: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        if 'collate_fn' in kwargs:
+            del kwargs['collate_fn']
+        # Save for PyTorch Lightning:
+        self.follow_batch = follow_batch
+        self.exclude_keys = exclude_keys
+        super().__init__(
+            dataset,
+            batch_size,
+            shuffle,
+            collate_fn=Collater(follow_batch, exclude_keys),
+            **kwargs,
+        )
+def collate_fn(data_list):
+    data_list = [x for x in data_list if x is not None]
+    return data_list
+class DataListLoader(torch.utils.data.DataLoader):
+    def __init__(self, dataset: Union[Dataset, List[BaseData]],
+                 batch_size: int = 1, shuffle: bool = False, **kwargs):
+        if 'collate_fn' in kwargs:
+            del kwargs['collate_fn']
+        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle,
+                         collate_fn=collate_fn, **kwargs)

datasets/esm_embedding_preparation.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import os
+import pickle
+from argparse import ArgumentParser
+from Bio.PDB import PDBParser
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from tqdm import tqdm
+from Bio import SeqIO
+from datasets.constants import three_to_one
+parser = ArgumentParser()
+parser.add_argument('--out_file', type=str, default="data/prepared_for_esm.fasta")
+parser.add_argument('--dataset', type=str, default="pdbbind")
+parser.add_argument('--data_dir', type=str, default='../data/BindingMOAD_2020_ab_processed_biounit/pdb_protein/', help='')
+args = parser.parse_args()
+biopython_parser = PDBParser()
+def get_structure_from_file(file_path):
+    structure = biopython_parser.get_structure('random_id', file_path)
+    structure = structure[0]
+    l = []
+    for i, chain in enumerate(structure):
+        seq = ''
+        for res_idx, residue in enumerate(chain):
+            if residue.get_resname() == 'HOH':
+                continue
+            residue_coords = []
+            c_alpha, n, c = None, None, None
+            for atom in residue:
+                if atom.name == 'CA':
+                    c_alpha = list(atom.get_vector())
+                if atom.name == 'N':
+                    n = list(atom.get_vector())
+                if atom.name == 'C':
+                    c = list(atom.get_vector())
+            if c_alpha != None and n != None and c != None:  # only append residue if it is an amino acid
+                try:
+                    seq += three_to_one[residue.get_resname()]
+                except Exception as e:
+                    seq += '-'
+                    print("encountered unknown AA: ", residue.get_resname(), ' in the complex ', file_path, '. Replacing it with a dash - .')
+        l.append(seq)
+    return l
+data_dir = args.data_dir
+names = os.listdir(data_dir)
+if args.dataset == 'pdbbind':
+    sequences = []
+    ids = []
+    for name in tqdm(names):
+        if name == '.DS_Store': continue
+        if os.path.exists(os.path.join(data_dir, name, f'{name}_protein_processed.pdb')):
+            rec_path = os.path.join(data_dir, name, f'{name}_protein_processed.pdb')
+        else:
+            rec_path = os.path.join(data_dir, name, f'{name}_protein.pdb')
+        l = get_structure_from_file(rec_path)
+        for i, seq in enumerate(l):
+            sequences.append(seq)
+            ids.append(f'{name}_chain_{i}')
+    records = []
+    for (index, seq) in zip(ids, sequences):
+        record = SeqRecord(Seq(seq), str(index))
+        record.description = ''
+        records.append(record)
+    SeqIO.write(records, args.out_file, "fasta")
+elif args.dataset == 'moad':
+    names = [n[:6] for n in names]
+    name_to_sequence = {}
+    for name in tqdm(names):
+        if name == '.DS_Store': continue
+        if not os.path.exists(os.path.join(data_dir, f'{name}_protein.pdb')):
+            print(f"We are skipping {name} because there was no {name}_protein.pdb")
+            continue
+        rec_path = os.path.join(data_dir, f'{name}_protein.pdb')
+        l = get_structure_from_file(rec_path)
+        for i, seq in enumerate(l):
+            name_to_sequence[name + '_chain_' + str(i)] = seq
+    # save to file
+    with open(args.out_file, 'wb') as f:
+        pickle.dump(name_to_sequence, f)

datasets/esm_embeddings_to_pt.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import os
+from argparse import ArgumentParser
+import torch
+from tqdm import tqdm
+parser = ArgumentParser()
+parser.add_argument('--esm_embeddings_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new', help='')
+parser.add_argument('--output_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new.pt', help='')
+args = parser.parse_args()
+dict = {}
+for filename in tqdm(os.listdir(args.esm_embeddings_path)):
+    dict[filename.split('.')[0]] = torch.load(os.path.join(args.esm_embeddings_path,filename))['representations'][33]
+torch.save(dict,args.output_path)

datasets/loader.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import torch
+from torch_geometric.data import Dataset
+from datasets.dataloader import DataLoader, DataListLoader
+from datasets.moad import MOAD
+from datasets.pdb import PDBSidechain
+from datasets.pdbbind import NoiseTransform, PDBBind
+from utils.utils import read_strings_from_txt
+class CombineDatasets(Dataset):
+    def __init__(self, dataset1, dataset2):
+        super(CombineDatasets, self).__init__()
+        self.dataset1 = dataset1
+        self.dataset2 = dataset2
+    def len(self):
+        return len(self.dataset1) + len(self.dataset2)
+    def get(self, idx):
+        if idx < len(self.dataset1):
+            return self.dataset1[idx]
+        else:
+            return self.dataset2[idx - len(self.dataset1)]
+    def add_complexes(self, new_complex_list):
+        self.dataset1.add_complexes(new_complex_list)
+def construct_loader(args, t_to_sigma, device):
+    """Build the training and validation loaders selected by ``args``.
+
+    Args:
+        args: namespace of run options; the attributes read are the ``args.*`` accesses below.
+        t_to_sigma: forwarded to ``NoiseTransform``.
+        device: not used inside this function.
+
+    Returns:
+        ``(train_loader, val_loader, val_dataset2)``; ``val_dataset2`` is the PDBBind
+        validation dataset when ``args.double_val`` is set, otherwise ``None``.
+    """
+    val_dataset2 = None
+    # The same noise transform is attached to every dataset built below.
+    transform = NoiseTransform(t_to_sigma=t_to_sigma, no_torsion=args.no_torsion,
+                               all_atom=args.all_atoms, alpha=args.sampling_alpha, beta=args.sampling_beta,
+                               include_miscellaneous_atoms=False if not hasattr(args, 'include_miscellaneous_atoms') else args.include_miscellaneous_atoms,
+                               crop_beyond_cutoff=args.crop_beyond)
+    # Triple training is only valid on top of combined training.
+    if args.triple_training: assert args.combined_training
+    sequences_to_embeddings = None
+    if args.dataset == 'pdbsidechain' or args.triple_training:
+        if args.pdbsidechain_esm_embeddings_path is not None:
+            print('Loading ESM embeddings')
+            id_to_embeddings = torch.load(args.pdbsidechain_esm_embeddings_path)
+            sequences_list = read_strings_from_txt(args.pdbsidechain_esm_embeddings_sequences_path)
+            # Embeddings are keyed by the stringified position of each sequence in the sequence file.
+            sequences_to_embeddings = {}
+            for i, seq in enumerate(sequences_list):
+                if str(i) in id_to_embeddings:
+                    sequences_to_embeddings[seq] = id_to_embeddings[str(i)]
+    if args.dataset == 'pdbsidechain' or args.triple_training:
+        common_args = {'root': args.pdbsidechain_dir, 'transform': transform, 'limit_complexes': args.limit_complexes,
+                       'receptor_radius': args.receptor_radius,
+                       'c_alpha_max_neighbors': args.c_alpha_max_neighbors,
+                       'remove_hs': args.remove_hs, 'num_workers': args.num_workers, 'all_atoms': args.all_atoms,
+                       'atom_radius': args.atom_radius, 'atom_max_neighbors': args.atom_max_neighbors,
+                       'knn_only_graph': not args.not_knn_only_graph, 'sequences_to_embeddings': sequences_to_embeddings,
+                       'vandermers_max_dist': args.vandermers_max_dist,
+                       'vandermers_buffer_residue_num': args.vandermers_buffer_residue_num,
+                       'vandermers_min_contacts': args.vandermers_min_contacts,
+                       'remove_second_segment': args.remove_second_segment,
+                       'merge_clusters': args.merge_clusters}
+        train_dataset3 = PDBSidechain(cache_path=args.cache_path, split='train', multiplicity=args.train_multiplicity, **common_args)
+        if args.dataset == 'pdbsidechain':
+            train_dataset = train_dataset3
+            val_dataset = PDBSidechain(cache_path=args.cache_path, split='val', multiplicity=args.val_multiplicity, **common_args)
+        loader_class = DataListLoader if torch.cuda.is_available() else DataLoader
+    if args.dataset in ['pdbbind', 'moad', 'generalisation', 'distillation']:
+        # common_args is rebuilt here; the side-chain dataset above has already consumed its own copy.
+        common_args = {'transform': transform, 'limit_complexes': args.limit_complexes,
+                       'chain_cutoff': args.chain_cutoff, 'receptor_radius': args.receptor_radius,
+                       'c_alpha_max_neighbors': args.c_alpha_max_neighbors,
+                       'remove_hs': args.remove_hs, 'max_lig_size': args.max_lig_size,
+                       'matching': not args.no_torsion, 'popsize': args.matching_popsize, 'maxiter': args.matching_maxiter,
+                       'num_workers': args.num_workers, 'all_atoms': args.all_atoms,
+                       'atom_radius': args.atom_radius, 'atom_max_neighbors': args.atom_max_neighbors,
+                       'knn_only_graph': False if not hasattr(args, 'not_knn_only_graph') else not args.not_knn_only_graph,
+                       'include_miscellaneous_atoms': False if not hasattr(args, 'include_miscellaneous_atoms') else args.include_miscellaneous_atoms,
+                       'matching_tries': args.matching_tries}
+        if args.dataset == 'pdbbind' or args.dataset == 'generalisation' or args.combined_training:
+            train_dataset = PDBBind(cache_path=args.cache_path, split_path=args.split_train, keep_original=True,
+                                    num_conformers=args.num_conformers, root=args.pdbbind_dir,
+                                    esm_embeddings_path=args.pdbbind_esm_embeddings_path,
+                                    protein_file=args.protein_file, **common_args)
+        if args.dataset == 'moad' or args.combined_training:
+            train_dataset2 = MOAD(cache_path=args.cache_path, split='train', keep_original=True,
+                                  num_conformers=args.num_conformers, max_receptor_size=args.max_receptor_size,
+                                  remove_promiscuous_targets=args.remove_promiscuous_targets, min_ligand_size=args.min_ligand_size,
+                                  multiplicity= args.train_multiplicity, unroll_clusters=args.unroll_clusters,
+                                  esm_embeddings_sequences_path=args.moad_esm_embeddings_sequences_path,
+                                  root=args.moad_dir, esm_embeddings_path=args.moad_esm_embeddings_path,
+                                  enforce_timesplit=args.enforce_timesplit, **common_args)
+            if args.combined_training:
+                # Order matters: MOAD is dataset1, so CombineDatasets.add_complexes targets MOAD.
+                train_dataset = CombineDatasets(train_dataset2, train_dataset)
+                if args.triple_training:
+                    train_dataset = CombineDatasets(train_dataset, train_dataset3)
+            else:
+                train_dataset = train_dataset2
+        if args.dataset == 'pdbbind' or args.double_val:
+            val_dataset = PDBBind(cache_path=args.cache_path, split_path=args.split_val, keep_original=True,
+                                  esm_embeddings_path=args.pdbbind_esm_embeddings_path, root=args.pdbbind_dir,
+                                  protein_file=args.protein_file, require_ligand=True, **common_args)
+            if args.double_val:
+                val_dataset2 = val_dataset
+        if args.dataset == 'moad' or args.dataset == 'generalisation':
+            # When double_val is also set, this replaces val_dataset; the PDBBind one lives on as val_dataset2.
+            val_dataset = MOAD(cache_path=args.cache_path, split='val', keep_original=True,
+                               multiplicity= args.val_multiplicity, max_receptor_size=args.max_receptor_size,
+                               remove_promiscuous_targets=args.remove_promiscuous_targets, min_ligand_size=args.min_ligand_size,
+                               esm_embeddings_sequences_path=args.moad_esm_embeddings_sequences_path,
+                               unroll_clusters=args.unroll_clusters, root=args.moad_dir,
+                               esm_embeddings_path=args.moad_esm_embeddings_path, require_ligand=True, **common_args)
+        loader_class = DataListLoader if torch.cuda.is_available() else DataLoader
+    # NOTE(review): train_dataset / val_dataset / loader_class are only bound inside the
+    # branches above. An unrecognised args.dataset, or 'distillation' without
+    # combined_training / double_val, reaches this point with unbound names.
+    train_loader = loader_class(dataset=train_dataset, batch_size=args.batch_size, num_workers=args.num_dataloader_workers, shuffle=True, pin_memory=args.pin_memory, drop_last=args.dataloader_drop_last)
+    val_loader = loader_class(dataset=val_dataset, batch_size=args.batch_size, num_workers=args.num_dataloader_workers, shuffle=False, pin_memory=args.pin_memory, drop_last=args.dataloader_drop_last)
+    return train_loader, val_loader, val_dataset2

datasets/moad.py ADDED Viewed

	@@ -0,0 +1,547 @@

+import os
+import pickle
+from multiprocessing import Pool
+import random
+import copy
+from torch_geometric.data import Batch
+import numpy as np
+import torch
+from prody import confProDy
+from rdkit import Chem
+from rdkit.Chem import RemoveHs
+from torch_geometric.data import Dataset, HeteroData
+from torch_geometric.utils import subgraph
+from tqdm import tqdm
+confProDy(verbosity='none')
+from datasets.process_mols import get_lig_graph_with_matching, moad_extract_receptor_structure
+from utils.utils import read_strings_from_txt
+class MOAD(Dataset):
+    def __init__(self, root, transform=None, cache_path='data/cache', split='train', limit_complexes=0, chain_cutoff=None,
+                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, popsize=15, maxiter=15,
+                 matching=True, keep_original=False, max_lig_size=None, remove_hs=False, num_conformers=1, all_atoms=False,
+                 atom_radius=5, atom_max_neighbors=None, esm_embeddings_path=None, esm_embeddings_sequences_path=None, require_ligand=False,
+                 include_miscellaneous_atoms=False, keep_local_structures=False,
+                 min_ligand_size=0, knn_only_graph=False, matching_tries=1, multiplicity=1,
+                 max_receptor_size=None, remove_promiscuous_targets=None, unroll_clusters=False, remove_pdbbind=False,
+                 enforce_timesplit=False, no_randomness=False, single_cluster_name=None, total_dataset_size=None, skip_matching=False):
+        """Load (and, if the caches are missing, build) the receptor and ligand graphs of one split.
+
+        The constructor stores the configuration, derives the cache directories,
+        runs the receptor/ligand preprocessing when cache files are absent, and
+        then applies a sequence of filters to ``self.ligands``,
+        ``self.cluster_to_ligands`` and ``self.split_clusters``.
+
+        Args (only what this method visibly does with them):
+            root: dataset directory; kept as ``self.moad_dir`` and used to locate
+                ``new_cluster_to_ligands.pkl``.
+            transform: forwarded to the ``Dataset`` base class.
+            cache_path: parent directory of the receptor and ligand caches.
+            split: key into the generalisation split pickle; ``'train'`` is looked
+                up under the key ``'PDBBind'`` while ``self.split`` keeps ``'train'``.
+            limit_complexes: stored; a non-zero value truncates the sorted complex
+                list in the preprocessing/collection methods.
+            min_ligand_size: ligands with fewer ligand nodes are dropped (0 disables).
+            max_receptor_size, remove_promiscuous_targets: forwarded to
+                ``collect_receptors``.
+            single_cluster_name: if given, only ligands listed for that cluster are kept.
+            remove_pdbbind: drop every cluster that contains a complex from the
+                PDBBind time-split train/val lists.
+            enforce_timesplit: drop ligands whose first four name characters
+                (upper-cased) appear in ``data/splits/pdbids_2019``.
+            unroll_clusters: replace the clustering by one cluster per receptor
+                (first six characters of the ligand name).
+            require_ligand: also load the cached RDKit molecules.
+            multiplicity, total_dataset_size: used by ``len``/``get``.
+            The remaining arguments are stored on ``self`` and/or encoded in the
+            cache directory names for use by the preprocessing methods.
+        """
+        super(MOAD, self).__init__(root, transform)
+        self.moad_dir = root
+        self.include_miscellaneous_atoms = include_miscellaneous_atoms
+        self.max_lig_size = max_lig_size
+        self.split = split
+        self.limit_complexes = limit_complexes
+        self.receptor_radius = receptor_radius
+        self.num_workers = num_workers
+        self.c_alpha_max_neighbors = c_alpha_max_neighbors
+        self.remove_hs = remove_hs
+        self.require_ligand = require_ligand
+        self.esm_embeddings_path = esm_embeddings_path
+        self.esm_embeddings_sequences_path = esm_embeddings_sequences_path
+        self.keep_local_structures = keep_local_structures
+        self.knn_only_graph = knn_only_graph
+        self.matching_tries = matching_tries
+        self.all_atoms = all_atoms
+        self.multiplicity = multiplicity
+        self.chain_cutoff = chain_cutoff
+        self.no_randomness = no_randomness
+        self.total_dataset_size = total_dataset_size
+        self.skip_matching = skip_matching
+        # Receptor cache directory: its name encodes every option that changes the receptor graphs.
+        self.prot_cache_path = os.path.join(cache_path, f'MOAD12_limit{self.limit_complexes}_INDEX{self.split}'
+                                                        f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
+                                            + (''if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
+                                            + ('' if self.esm_embeddings_path is None else f'_esmEmbeddings')
+                                            + ('' if not self.include_miscellaneous_atoms else '_miscAtoms')
+                                            + ('' if not self.knn_only_graph else '_knnOnly'))
+        # Ligand cache directory, built the same way from the ligand-related options.
+        # NOTE(review): the 'skip' suffix has no leading underscore, unlike the other
+        # suffixes; changing it would invalidate existing caches.
+        self.lig_cache_path = os.path.join(cache_path, f'MOAD12_limit{self.limit_complexes}_INDEX{self.split}'
+                                                        f'_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}'
+                                            + ('' if not matching else f'_matching')
+                                            + ('' if not skip_matching else f'skip')
+                                            + (''if not matching or num_conformers == 1 else f'_confs{num_conformers}')
+                                            + ('' if not keep_local_structures else f'_keptLocalStruct')
+                                            + ('' if self.matching_tries == 1 else f'_tries{matching_tries}'))
+        self.popsize, self.maxiter = popsize, maxiter
+        self.matching, self.keep_original = matching, keep_original
+        self.num_conformers = num_conformers
+        self.single_cluster_name = single_cluster_name
+        # Only the local lookup key is remapped; self.split (used in cache names and
+        # in get_ligand) still holds 'train'.
+        if split == 'train':
+            split = 'PDBBind'
+        with open("./data/splits/MOAD_generalisation_splits.pkl", "rb") as f:
+            self.split_clusters = pickle.load(f)[split]
+        clustes_path = os.path.join(self.moad_dir, "new_cluster_to_ligands.pkl")
+        with open(clustes_path, "rb") as f:
+            self.cluster_to_ligands = pickle.load(f)
+            #self.cluster_to_ligands = {k: [s.split('.')[0] for s in v] for k, v in self.cluster_to_ligands.items()}
+        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
+        # Build the receptor cache only when at least one chunk file is missing.
+        if not self.check_all_receptors():
+            os.makedirs(self.prot_cache_path, exist_ok=True)
+            self.preprocessing_receptors()
+        # NOTE(review): repeats the assignment made a few lines above; redundant.
+        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
+        # The merged ligands.pkl acts as the marker that ligand preprocessing finished.
+        if not os.path.exists(os.path.join(self.lig_cache_path, "ligands.pkl")):
+            os.makedirs(self.lig_cache_path, exist_ok=True)
+            self.preprocessing_ligands()
+        print('loading ligands from memory: ', os.path.join(self.lig_cache_path, "ligands.pkl"))
+        with open(os.path.join(self.lig_cache_path, "ligands.pkl"), 'rb') as f:
+            self.ligands = pickle.load(f)
+        if require_ligand:
+            with open(os.path.join(self.lig_cache_path, "rdkit_ligands.pkl"), 'rb') as f:
+                self.rdkit_ligands = pickle.load(f)
+                # pair molecules and graphs by position, then key the molecules by ligand name
+                self.rdkit_ligands = {lig.name:mol for mol, lig in zip(self.rdkit_ligands, self.ligands)}
+        len_before = len(self.ligands)
+        if not self.single_cluster_name is None:
+            self.ligands = [lig for lig in self.ligands if lig.name in self.cluster_to_ligands[self.single_cluster_name]]
+        # This message is printed even when no single cluster was requested.
+        print('Kept', len(self.ligands), f'ligands in {self.single_cluster_name} out of', len_before)
+        len_before = len(self.ligands)
+        # From here on self.ligands is a dict keyed by ligand name instead of a list.
+        self.ligands = {lig.name: lig for lig in self.ligands if min_ligand_size == 0 or lig['ligand'].x.shape[0] >= min_ligand_size}
+        print('removed', len_before - len(self.ligands), 'ligands below minimum size out of', len_before)
+        # The first six characters of a ligand name identify its receptor.
+        receptors_names = set([lig.name[:6] for lig in self.ligands.values()])
+        self.collect_receptors(receptors_names, max_receptor_size, remove_promiscuous_targets)
+        # filter ligands for which the receptor failed
+        tot_before = len(self.ligands)
+        self.ligands = {k:v for k, v in self.ligands.items() if k[:6] in self.receptors}
+        print('removed', tot_before - len(self.ligands), 'ligands with no receptor out of', tot_before)
+        if remove_pdbbind:
+            # Map the PDBBind train/val complexes to clusters and drop those clusters entirely.
+            complexes_pdbbind = read_strings_from_txt('data/splits/timesplit_no_lig_overlap_train') + read_strings_from_txt('data/splits/timesplit_no_lig_overlap_val')
+            with open('data/BindingMOAD_2020_ab_processed_biounit/ecod_t_group_binding_site_assignment_dict_major_domain.pkl', 'rb') as f:
+                pdbbind_to_cluster = pickle.load(f)
+            clusters_pdbbind = set([pdbbind_to_cluster[c] for c in complexes_pdbbind])
+            self.split_clusters = [c for c in self.split_clusters if c not in clusters_pdbbind]
+            self.cluster_to_ligands = {k: v for k, v in self.cluster_to_ligands.items() if k not in clusters_pdbbind}
+            ligand_accepted = []
+            for c, ligands in self.cluster_to_ligands.items():
+                ligand_accepted += ligands
+            ligand_accepted = set(ligand_accepted)
+            tot_before = len(self.ligands)
+            self.ligands = {k: v for k, v in self.ligands.items() if k in ligand_accepted}
+            print('removed', tot_before - len(self.ligands), 'ligands in overlap with PDBBind out of', tot_before)
+        if enforce_timesplit:
+            with open("data/splits/pdbids_2019", "r") as f:
+                lines = f.readlines()
+            pdbids_from2019 = []
+            # Reads characters 18:22 of every fourth line starting at line index 6;
+            # presumably the file lists one PDB id per four-line record -- TODO confirm format.
+            for i in range(6, len(lines), 4):
+                pdbids_from2019.append(lines[i][18:22])
+            pdbids_from2019 = set(pdbids_from2019)
+            len_before = len(self.ligands)
+            self.ligands = {k: v for k, v in self.ligands.items() if k[:4].upper() not in pdbids_from2019}
+            print('removed', len_before - len(self.ligands), 'ligands from 2019 out of', len_before)
+        if unroll_clusters:
+            # One cluster per receptor: each cluster holds the surviving ligands of that receptor.
+            rec_keys = set([k[:6] for k in self.ligands.keys()])
+            self.cluster_to_ligands = {k:[k2 for k2 in self.ligands.keys() if k2[:6] == k] for k in rec_keys}
+            self.split_clusters = list(rec_keys)
+        else:
+            # Keep the clustering but prune filtered-out ligands and clusters left empty.
+            for c in self.cluster_to_ligands.keys():
+                 self.cluster_to_ligands[c] = [v for v in self.cluster_to_ligands[c] if v in self.ligands]
+            self.split_clusters = [c for c in self.split_clusters if len(self.cluster_to_ligands[c])>0]
+        print_statistics(self)
+        # Record the names of all complexes that remain in this split next to the receptor cache.
+        list_names = [name for cluster in self.split_clusters for name in self.cluster_to_ligands[cluster]]
+        with open(os.path.join(self.prot_cache_path, f'moad_{self.split}_names.txt'), 'w') as f:
+            f.write('\n'.join(list_names))
+    def len(self):
+        return len(self.split_clusters) * self.multiplicity if self.total_dataset_size is None else self.total_dataset_size
+    def get_by_name(self, ligand_name, cluster):
+        """Assemble the complex graph for one ligand and its receptor.
+
+        Args:
+            ligand_name: key into ``self.ligands``; its first six characters are
+                the key of the receptor in ``self.receptors``.
+            cluster: cluster the ligand belongs to (only read by the disabled
+                duplicate-pose branch below).
+
+        Returns:
+            A deep copy of the receptor graph with the ligand stores merged in and
+            the ligand positions expressed in the receptor-centred frame. When
+            ``self.chain_cutoff`` is set and no receptor node lies within the
+            cutoff, the result of ``self.get`` on a random index is returned instead.
+        """
+        # Deep copies keep the cached graphs untouched by the in-place edits below.
+        ligand_graph = copy.deepcopy(self.ligands[ligand_name])
+        complex_graph = copy.deepcopy(self.receptors[ligand_name[:6]])
+        # NOTE: the leading `False and` permanently disables this branch.
+        if False and self.keep_original and hasattr(ligand_graph['ligand'], 'orig_pos'):
+            lig_path = os.path.join(self.moad_dir, 'pdb_superligand', ligand_name + '.pdb')
+            lig = Chem.MolFromPDBFile(lig_path)
+            formula = np.asarray([atom.GetSymbol() for atom in lig.GetAtoms()])
+            # check for same receptor/ligand pair with a different binding position
+            for ligand_comp in self.cluster_to_ligands[cluster]:
+                if ligand_comp == ligand_name or ligand_comp[:6] != ligand_name[:6]:
+                    continue
+                lig_path_comp = os.path.join(self.moad_dir, 'pdb_superligand', ligand_comp + '.pdb')
+                if not os.path.exists(lig_path_comp):
+                    continue
+                lig_comp = Chem.MolFromPDBFile(lig_path_comp)
+                formula_comp = np.asarray([atom.GetSymbol() for atom in lig_comp.GetAtoms()])
+                if formula.shape == formula_comp.shape and np.all(formula == formula_comp) and hasattr(
+                        self.ligands[ligand_comp], 'orig_pos'):
+                    print(f'Found complex {ligand_comp} to have the same complex/ligand pair, adding it into orig_pos')
+                    # add the orig_pos of the binding position
+                    if not isinstance(ligand_graph['ligand'].orig_pos, list):
+                        ligand_graph['ligand'].orig_pos = [ligand_graph['ligand'].orig_pos]
+                    ligand_graph['ligand'].orig_pos.append(self.ligands[ligand_comp].orig_pos)
+        # Copy every node/edge store of the ligand graph into the receptor graph.
+        for type in ligand_graph.node_types + ligand_graph.edge_types:
+            for key, value in ligand_graph[type].items():
+                complex_graph[type][key] = value
+        complex_graph.name = ligand_graph.name
+        # Move the ligand into the receptor-centred frame (original_center is set in get_receptor).
+        if isinstance(complex_graph['ligand'].pos, list):
+            for i in range(len(complex_graph['ligand'].pos)):
+                complex_graph['ligand'].pos[i] -= complex_graph.original_center
+        else:
+            complex_graph['ligand'].pos -= complex_graph.original_center
+        if self.require_ligand:
+            complex_graph.mol = copy.deepcopy(self.rdkit_ligands[ligand_name])
+        if self.chain_cutoff:
+            # Pairwise distances between the first stored original ligand pose and all
+            # receptor nodes, reduced to the closest ligand atom for each receptor node.
+            distances = torch.norm(
+                (torch.from_numpy(complex_graph['ligand'].orig_pos[0]) - complex_graph.original_center).unsqueeze(1) - complex_graph['receptor'].pos.unsqueeze(0), dim=2)
+            distances = distances.min(dim=0)[0]
+            if torch.min(distances) >= self.chain_cutoff:
+                print('minimum distance', torch.min(distances), 'too large', ligand_name,
+                      'skipping and returning random. Number of chains',
+                      torch.max(complex_graph['receptor'].chain_ids) + 1)
+                # randint includes the upper bound; get() reduces the index modulo the cluster count.
+                return self.get(random.randint(0, self.len()))
+            within_cutoff = distances < self.chain_cutoff
+            # Count, per chain, the receptor nodes inside the cutoff; a chain is kept
+            # in full as soon as one of its nodes is close enough.
+            chains_within_cutoff = torch.zeros(torch.max(complex_graph['receptor'].chain_ids) + 1)
+            chains_within_cutoff.index_add_(0, complex_graph['receptor'].chain_ids, within_cutoff.float())
+            chains_within_cutoff_bool = chains_within_cutoff > 0
+            residues_to_keep = chains_within_cutoff_bool[complex_graph['receptor'].chain_ids]
+            if self.all_atoms:
+                # Keep the atoms of kept residues and renumber the atom -> residue edges
+                # to the compacted residue and atom indices.
+                atom_to_res_mapping = complex_graph['atom', 'atom_rec_contact', 'receptor'].edge_index[1]
+                atoms_to_keep = residues_to_keep[atom_to_res_mapping]
+                rec_remapper = (torch.cumsum(residues_to_keep.long(), dim=0) - 1)
+                atom_to_res_new_mapping = rec_remapper[atom_to_res_mapping][atoms_to_keep]
+                atom_res_edge_index = torch.stack([torch.arange(len(atom_to_res_new_mapping)), atom_to_res_new_mapping])
+                complex_graph['atom'].x = complex_graph['atom'].x[atoms_to_keep]
+                complex_graph['atom'].pos = complex_graph['atom'].pos[atoms_to_keep]
+                complex_graph['atom', 'atom_contact', 'atom'].edge_index = \
+                    subgraph(atoms_to_keep, complex_graph['atom', 'atom_contact', 'atom'].edge_index,
+                             relabel_nodes=True)[0]
+                complex_graph['atom', 'atom_rec_contact', 'receptor'].edge_index = atom_res_edge_index
+            complex_graph['receptor'].pos = complex_graph['receptor'].pos[residues_to_keep]
+            complex_graph['receptor'].x = complex_graph['receptor'].x[residues_to_keep]
+            complex_graph['receptor'].side_chain_vecs = complex_graph['receptor'].side_chain_vecs[residues_to_keep]
+            complex_graph['receptor', 'rec_contact', 'receptor'].edge_index = \
+            subgraph(residues_to_keep, complex_graph['receptor', 'rec_contact', 'receptor'].edge_index,
+                     relabel_nodes=True)[0]
+            # Re-centre on the remaining receptor nodes and shift the ligand by the same offset.
+            # NOTE(review): the 'atom' positions are not shifted here -- confirm whether intended.
+            extra_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
+            complex_graph['receptor'].pos -= extra_center
+            if isinstance(complex_graph['ligand'].pos, list):
+                for i in range(len(complex_graph['ligand'].pos)):
+                    complex_graph['ligand'].pos[i] -= extra_center
+            else:
+                complex_graph['ligand'].pos -= extra_center
+            complex_graph.original_center += extra_center
+        complex_graph['receptor'].pop('chain_ids')
+        # Remove bookkeeping attributes from the graph and its receptor store before returning.
+        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq',
+                  'to_keep', 'chain_ids']:
+            if hasattr(complex_graph, a):
+                delattr(complex_graph, a)
+            if hasattr(complex_graph['receptor'], a):
+                delattr(complex_graph['receptor'], a)
+        return complex_graph
+    def get(self, idx):
+        if self.total_dataset_size is not None:
+            idx = random.randint(0, len(self.split_clusters) - 1)
+        idx = idx % len(self.split_clusters)
+        cluster = self.split_clusters[idx]
+        if self.no_randomness:
+            ligand_name = sorted(self.cluster_to_ligands[cluster])[0]
+        else:
+            ligand_name = random.choice(self.cluster_to_ligands[cluster])
+        complex_graph = self.get_by_name(ligand_name, cluster)
+        if self.total_dataset_size is not None:
+            complex_graph = Batch.from_data_list([complex_graph])
+        return complex_graph
+    def get_all_complexes(self):
+        complexes = {}
+        for cluster in self.split_clusters:
+            for ligand_name in self.cluster_to_ligands[cluster]:
+                complexes[ligand_name] = self.get_by_name(ligand_name, cluster)
+        return complexes
+    def preprocessing_receptors(self):
+        print(f'Processing receptors from [{self.split}] and saving it to [{self.prot_cache_path}]')
+        complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        receptor_names_all = [l[:6] for l in complex_names_all]
+        receptor_names_all = sorted(list(dict.fromkeys(receptor_names_all)))
+        print(f'Loading {len(receptor_names_all)} receptors.')
+        if self.esm_embeddings_path is not None:
+            id_to_embeddings = torch.load(self.esm_embeddings_path)
+            sequences_list = read_strings_from_txt(self.esm_embeddings_sequences_path)
+            sequences_to_embeddings = {}
+            for i, seq in enumerate(sequences_list):
+                sequences_to_embeddings[seq] = id_to_embeddings[str(i)]
+        else:
+            sequences_to_embeddings = None
+        # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
+        list_indices = list(range(len(receptor_names_all)//1000+1))
+        random.shuffle(list_indices)
+        for i in list_indices:
+            if os.path.exists(os.path.join(self.prot_cache_path, f"receptors{i}.pkl")):
+                continue
+            receptor_names = receptor_names_all[1000*i:1000*(i+1)]
+            receptor_graphs = []
+            if self.num_workers > 1:
+                p = Pool(self.num_workers, maxtasksperchild=1)
+                p.__enter__()
+            with tqdm(total=len(receptor_names), desc=f'loading receptors {i}/{len(receptor_names_all)//1000+1}') as pbar:
+                map_fn = p.imap_unordered if self.num_workers > 1 else map
+                for t in map_fn(self.get_receptor, zip(receptor_names, [sequences_to_embeddings]*len(receptor_names))):
+                    if t is not None:
+                        print(len(receptor_graphs))
+                        receptor_graphs.append(t)
+                    pbar.update()
+            if self.num_workers > 1: p.__exit__(None, None, None)
+            print('Number of receptors: ', len(receptor_graphs))
+            with open(os.path.join(self.prot_cache_path, f"receptors{i}.pkl"), 'wb') as f:
+                pickle.dump((receptor_graphs), f)
+        return receptor_names_all
+    def check_all_receptors(self):
+        complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        receptor_names_all = [l[:6] for l in complex_names_all]
+        receptor_names_all = list(dict.fromkeys(receptor_names_all))
+        for i in range(len(receptor_names_all)//1000+1):
+            if not os.path.exists(os.path.join(self.prot_cache_path, f"receptors{i}.pkl")):
+                return False
+        return True
+    def collect_receptors(self, receptors_to_keep=None, max_receptor_size=None, remove_promiscuous_targets=None):
+        complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        receptor_names_all = [l[:6] for l in complex_names_all]
+        receptor_names_all = sorted(list(dict.fromkeys(receptor_names_all)))
+        receptor_graphs_all = []
+        total_recovered = 0
+        print(f'Loading {len(receptor_names_all)} receptors to keep {len(receptors_to_keep)}.')
+        for i in range(len(receptor_names_all)//1000+1):
+            print(f'prot path: {os.path.join(self.prot_cache_path, f"receptors{i}.pkl")}')
+            with open(os.path.join(self.prot_cache_path, f"receptors{i}.pkl"), 'rb') as f:
+                l = pickle.load(f)
+                total_recovered += len(l)
+                if receptors_to_keep is not None:
+                    l = [t for t in l if t['receptor_name'] in receptors_to_keep]
+                receptor_graphs_all.extend(l)
+        cur_len = len(receptor_graphs_all)
+        print(f"Kept {len(receptor_graphs_all)} receptors out of {len(receptor_names_all)} total and recovered {total_recovered}")
+        if max_receptor_size is not None:
+            receptor_graphs_all = [rec for rec in receptor_graphs_all if rec["receptor"].pos.shape[0] <= max_receptor_size]
+            print(f"Kept {len(receptor_graphs_all)} receptors out of {cur_len} after filtering by size")
+            cur_len = len(receptor_graphs_all)
+        if remove_promiscuous_targets is not None:
+            promiscuous_targets = set()
+            for name in complex_names_all:
+                l = name.split('_')
+                if int(l[3]) > remove_promiscuous_targets:
+                    promiscuous_targets.add(name[:6])
+            receptor_graphs_all = [rec for rec in receptor_graphs_all if rec["receptor_name"] not in promiscuous_targets]
+            print(f"Kept {len(receptor_graphs_all)} receptors out of {cur_len} after removing promiscuous targets")
+        self.receptors = {}
+        for r in receptor_graphs_all:
+            self.receptors[r['receptor_name']] = r
+        return
+    def get_receptor(self, par):
+        name, sequences_to_embeddings = par
+        rec_path = os.path.join(self.moad_dir, 'pdb_protein', name + '_protein.pdb')
+        if not os.path.exists(rec_path):
+            print("Receptor not found", name, rec_path)
+            return None
+        complex_graph = HeteroData()
+        complex_graph['receptor_name'] = name
+        try:
+            moad_extract_receptor_structure(path=rec_path, complex_graph=complex_graph, neighbor_cutoff=self.receptor_radius,
+                                            max_neighbors=self.c_alpha_max_neighbors, sequences_to_embeddings=sequences_to_embeddings,
+                                            knn_only_graph=self.knn_only_graph, all_atoms=self.all_atoms, atom_cutoff=self.atom_radius,
+                                            atom_max_neighbors=self.atom_max_neighbors)
+        except Exception as e:
+            print(f'Skipping {name} because of the error:')
+            print(e)
+            return None
+        protein_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
+        complex_graph['receptor'].pos -= protein_center
+        if self.all_atoms:
+            complex_graph['atom'].pos -= protein_center
+        complex_graph.original_center = protein_center
+        return complex_graph
+    def preprocessing_ligands(self):
+        print(f'Processing complexes from [{self.split}] and saving it to [{self.lig_cache_path}]')
+        complex_names_all = sorted([l for c in self.split_clusters for l in self.cluster_to_ligands[c]])
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        print(f'Loading {len(complex_names_all)} ligands.')
+        # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
+        list_indices = list(range(len(complex_names_all)//1000+1))
+        random.shuffle(list_indices)
+        for i in list_indices:
+            if os.path.exists(os.path.join(self.lig_cache_path, f"ligands{i}.pkl")):
+                continue
+            complex_names = complex_names_all[1000*i:1000*(i+1)]
+            ligand_graphs, rdkit_ligands = [], []
+            if self.num_workers > 1:
+                p = Pool(self.num_workers, maxtasksperchild=1)
+                p.__enter__()
+            with tqdm(total=len(complex_names), desc=f'loading complexes {i}/{len(complex_names_all)//1000+1}') as pbar:
+                map_fn = p.imap_unordered if self.num_workers > 1 else map
+                for t in map_fn(self.get_ligand, complex_names):
+                    if t is not None:
+                        ligand_graphs.append(t[0])
+                        rdkit_ligands.append(t[1])
+                    pbar.update()
+            if self.num_workers > 1: p.__exit__(None, None, None)
+            with open(os.path.join(self.lig_cache_path, f"ligands{i}.pkl"), 'wb') as f:
+                pickle.dump((ligand_graphs), f)
+            with open(os.path.join(self.lig_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
+                pickle.dump((rdkit_ligands), f)
+        ligand_graphs_all = []
+        for i in range(len(complex_names_all)//1000+1):
+            with open(os.path.join(self.lig_cache_path, f"ligands{i}.pkl"), 'rb') as f:
+                l = pickle.load(f)
+                ligand_graphs_all.extend(l)
+        with open(os.path.join(self.lig_cache_path, f"ligands.pkl"), 'wb') as f:
+            pickle.dump((ligand_graphs_all), f)
+        rdkit_ligands_all = []
+        for i in range(len(complex_names_all) // 1000 + 1):
+            with open(os.path.join(self.lig_cache_path, f"rdkit_ligands{i}.pkl"), 'rb') as f:
+                l = pickle.load(f)
+                rdkit_ligands_all.extend(l)
+        with open(os.path.join(self.lig_cache_path, f"rdkit_ligands.pkl"), 'wb') as f:
+            pickle.dump((rdkit_ligands_all), f)
+    def get_ligand(self, name):
+        if self.split == 'train':
+            lig_path = os.path.join(self.moad_dir, 'pdb_superligand', name + '.pdb')
+        else:
+            lig_path = os.path.join(self.moad_dir, 'pdb_ligand', name + '.pdb')
+        if not os.path.exists(lig_path):
+            print("Ligand not found", name, lig_path)
+            return None
+        # read pickle
+        lig = Chem.MolFromPDBFile(lig_path)
+        if self.max_lig_size is not None and lig.GetNumHeavyAtoms() > self.max_lig_size:
+            print(f'Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data.')
+            return None
+        try:
+            if self.matching:
+                smile = Chem.MolToSmiles(lig)
+                if '.' in smile:
+                    print(f'Ligand {name} has multiple fragments and we are doing matching. Not including {name} in preprocessed data.')
+                    return None
+            complex_graph = HeteroData()
+            complex_graph['name'] = name
+            Chem.SanitizeMol(lig)
+            get_lig_graph_with_matching(lig, complex_graph, self.popsize, self.maxiter, self.matching, self.keep_original,
+                                        self.num_conformers, remove_hs=self.remove_hs, tries=self.matching_tries, skip_matching=self.skip_matching)
+        except Exception as e:
+            print(f'Skipping {name} because of the error:')
+            print(e)
+            return None
+        if self.split != 'train':
+            other_positions = [complex_graph['ligand'].orig_pos]
+            nsplit = name.split('_')
+            for i in range(100):
+                new_file = os.path.join(self.moad_dir, 'pdb_ligand', f'{nsplit[0]}_{nsplit[1]}_{nsplit[2]}_{i}.pdb')
+                if os.path.exists(new_file):
+                    if i != int(nsplit[3]):
+                        lig = Chem.MolFromPDBFile(new_file)
+                        lig = RemoveHs(lig, sanitize=True)
+                        other_positions.append(lig.GetConformer().GetPositions())
+                else:
+                    break
+            complex_graph['ligand'].orig_pos = np.asarray(other_positions)
+        return complex_graph, lig
+def print_statistics(dataset):
+    statistics = ([], [], [], [], [], [])
+    receptor_sizes = []
+    for i in range(len(dataset)):
+        complex_graph = dataset[i]
+        lig_pos = complex_graph['ligand'].pos if torch.is_tensor(complex_graph['ligand'].pos) else complex_graph['ligand'].pos[0]
+        receptor_sizes.append(complex_graph['receptor'].pos.shape[0])
+        radius_protein = torch.max(torch.linalg.vector_norm(complex_graph['receptor'].pos, dim=1))
+        molecule_center = torch.mean(lig_pos, dim=0)
+        radius_molecule = torch.max(
+            torch.linalg.vector_norm(lig_pos - molecule_center.unsqueeze(0), dim=1))
+        distance_center = torch.linalg.vector_norm(molecule_center)
+        statistics[0].append(radius_protein)
+        statistics[1].append(radius_molecule)
+        statistics[2].append(distance_center)
+        if "rmsd_matching" in complex_graph:
+            statistics[3].append(complex_graph.rmsd_matching)
+        else:
+            statistics[3].append(0)
+        statistics[4].append(int(complex_graph.random_coords) if "random_coords" in complex_graph else -1)
+        if "random_coords" in complex_graph and complex_graph.random_coords and "rmsd_matching" in complex_graph:
+            statistics[5].append(complex_graph.rmsd_matching)
+    if len(statistics[5]) == 0:
+        statistics[5].append(-1)
+    name = ['radius protein', 'radius molecule', 'distance protein-mol', 'rmsd matching', 'random coordinates', 'random rmsd matching']
+    print('Number of complexes: ', len(dataset))
+    for i in range(len(name)):
+        array = np.asarray(statistics[i])
+        print(f"{name[i]}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}")
+    return

datasets/parse_chi.py ADDED Viewed

	@@ -0,0 +1,146 @@

+# From Nick Polizzi
+import numpy as np
+from collections import defaultdict
+import prody as pr
+import os
+from datasets.constants import chi, atom_order, aa_long2short, aa_short2aa_idx, aa_idx2aa_short
+def get_dihedral_indices(resname, chi_num):
+    """Return the atom indices for the specified dihedral angle.
+    """
+    if resname not in chi:
+        return np.array([np.nan]*4)
+    if chi_num not in chi[resname]:
+        return np.array([np.nan]*4)
+    return np.array([atom_order[resname].index(x) for x in chi[resname][chi_num]])
+# Lookup table built at import time: for every residue key of atom_order, a
+# (4, 4) array holding the atom indices of chi1..chi4 (rows of NaN where an
+# angle is not defined).
+# NOTE: this must run before the one-argument get_dihedral_indices defined
+# further down rebinds the name; here the two-argument version is still in scope.
+dihedral_indices = defaultdict(list)
+for aa in atom_order.keys():
+    for i in range(1, 5):
+        inds = get_dihedral_indices(aa, i)
+        dihedral_indices[aa].append(inds)
+    dihedral_indices[aa] = np.array(dihedral_indices[aa])
+def vector_batch(a, b):
+    return a - b
+def unit_vector_batch(v):
+    return v / np.linalg.norm(v, axis=1, keepdims=True)
+def dihedral_angle_batch(p):
+    b0 = vector_batch(p[:, 0], p[:, 1])
+    b1 = vector_batch(p[:, 1], p[:, 2])
+    b2 = vector_batch(p[:, 2], p[:, 3])
+    n1 = np.cross(b0, b1)
+    n2 = np.cross(b1, b2)
+    m1 = np.cross(n1, b1 / np.linalg.norm(b1, axis=1, keepdims=True))
+    x = np.sum(n1 * n2, axis=1)
+    y = np.sum(m1 * n2, axis=1)
+    deg = np.degrees(np.arctan2(y, x))
+    deg[deg < 0] += 360
+    return deg
+def batch_compute_dihedral_angles(sidechains):
+    sidechains_np = np.array(sidechains)
+    dihedral_angles = dihedral_angle_batch(sidechains_np)
+    return dihedral_angles
+def get_coords(prody_pdb):
+    resindices = sorted(set(prody_pdb.ca.getResindices()))
+    coords = np.full((len(resindices), 14, 3), np.nan)
+    for i, resind in enumerate(resindices):
+        sel = prody_pdb.select(f'resindex {resind}')
+        resname = sel.getResnames()[0]
+        for j, name in enumerate(atom_order[aa_long2short[resname] if resname in aa_long2short else 'X']):
+            sel_resnum_name = sel.select(f'name {name}')
+            if sel_resnum_name is not None:
+                coords[i, j, :] = sel_resnum_name.getCoords()[0]
+            else:
+                coords[i, j, :] = [np.nan, np.nan, np.nan]
+    return coords
+def get_onehot_sequence(seq):
+    onehot = np.zeros((len(seq), 20))
+    for i, aa in enumerate(seq):
+        idx = aa_short2aa_idx[aa] if aa in aa_short2aa_idx else 7 # 7 is the index for GLY
+        onehot[i, idx] = 1
+    return onehot
+def get_dihedral_indices(onehot_sequence):
+    return np.array([dihedral_indices[aa_idx2aa_short[aa_idx]] for aa_idx in np.where(onehot_sequence)[1]])
+def _get_chi_angles(coords, indices):
+    X = coords
+    Y = indices.astype(int)
+    N = coords.shape[0]
+    mask = np.isnan(indices)
+    Y[mask] = 0
+    Z = X[np.arange(N)[:, None, None], Y, :]
+    Z[mask] = np.nan
+    chi_angles = batch_compute_dihedral_angles(Z.reshape(-1, 4, 3)).reshape(N, 4)
+    return chi_angles
+def get_chi_angles(coords, seq, return_onehot=False):
+    """
+    Parameters
+    ----------
+    prody_pdb : prody.AtomGroup
+        prody pdb object or selection
+    return_coords : bool, optional
+        return coordinates of prody_pdb in (N, 14, 3) array format, by default False
+    return_onehot : bool, optional
+        return one-hot sequence of prody_pdb, by default False
+    Returns
+    -------
+    numpy array of shape (N, 4)
+        Array contains chi angles of sidechains in row-order of residue indices in prody_pdb.
+        If a chi angle is not defined for a residue, due to missing atoms or GLY / ALA, it is set to np.nan.
+    """
+    onehot = get_onehot_sequence(seq)
+    dihedral_indices = get_dihedral_indices(onehot)
+    if return_onehot:
+        return _get_chi_angles(coords, dihedral_indices), onehot
+    return _get_chi_angles(coords, dihedral_indices)
+def test_get_chi_angles(print_chi_angles=False):
+    # need internet connection of '6w70.pdb' in working directory
+    pdb = pr.parsePDB('6w70')
+    prody_pdb = pdb.select('chain A')
+    chi_angles = get_chi_angles(prody_pdb)
+    assert chi_angles.shape == (prody_pdb.ca.numAtoms(), 4)
+    assert chi_angles[0,0] < 56.0 and chi_angles[0,0] > 55.0
+    print('test_get_chi_angles passed')
+    try:
+        os.remove('6w70.pdb.gz')
+    except:
+        pass
+    if print_chi_angles:
+        print(chi_angles)
+    return True
+# Run the smoke test (printing the angles) when this module is executed as a script.
+if __name__ == '__main__':
+    test_get_chi_angles(print_chi_angles=True)

datasets/pdb.py ADDED Viewed

	@@ -0,0 +1,536 @@

+# Significant contribution from Ben Fry
+import copy
+import os.path
+import pickle
+import random
+from multiprocessing import Pool
+import numpy as np
+import pandas as pd
+import torch
+from rdkit import Chem
+from rdkit.Chem import AllChem, MolFromSmiles
+from scipy.spatial.distance import pdist, squareform
+from torch_geometric.data import Dataset, HeteroData
+from torch_geometric.utils import subgraph
+from tqdm import tqdm
+from datasets.constants import aa_to_cg_indices, amino_acid_smiles, cg_rdkit_indices
+from datasets.parse_chi import aa_long2short, atom_order
+from datasets.process_mols import new_extract_receptor_structure, get_lig_graph, generate_conformer
+from utils.torsion import get_transformation_mask
+def read_strings_from_txt(path):
+    # every line will be one element of the returned list
+    with open(path) as file:
+        lines = file.readlines()
+        return [line.rstrip() for line in lines]
+def compute_num_ca_neighbors(coords, cg_coords, idx, is_valid_bb_node, max_dist=5, buffer_residue_num=7):
+    """
+    Counts number of residues with heavy atoms within max_dist (Angstroms) of this sidechain that are not
+    residues within +/- buffer_residue_num in primary sequence.
+    From Ben's code
+    Note: Gabriele removed the chain_index
+    """
+    # Extract coordinates of all residues in the protein.
+    bb_coords = coords
+    # Compute the indices that we should not consider interactions.
+    excluded_neighbors = [idx - x for x in reversed(range(0, buffer_residue_num+1)) if (idx - x) >= 0]
+    excluded_neighbors.extend([idx + x for x in range(1, buffer_residue_num+1)])
+    # Create indices of an N x M distance matrix where N is num BB nodes and M is num CG nodes.
+    e_idx = torch.stack([
+        torch.arange(bb_coords.shape[0]).unsqueeze(-1).expand((-1, cg_coords.shape[0])).flatten(),
+        torch.arange(cg_coords.shape[0]).unsqueeze(0).expand((bb_coords.shape[0], -1)).flatten()
+    ])
+    # Expand bb_coords and cg_coords into the same dimensionality.
+    bb_coords_exp = bb_coords[e_idx[0]]
+    cg_coords_exp = cg_coords[e_idx[1]].unsqueeze(1)
+    # Every row is distance of chemical group to each atom in backbone coordinate frame.
+    bb_exp_idces, _ = (torch.cdist(bb_coords_exp, cg_coords_exp).squeeze(-1) < max_dist).nonzero(as_tuple=True)
+    bb_idces_within_thresh = torch.unique(e_idx[0][bb_exp_idces])
+    # Only count residues that are not adjacent or origin in primary sequence and are valid backbone residues (fully resolved coordinate frame).
+    bb_idces_within_thresh = bb_idces_within_thresh[~torch.isin(bb_idces_within_thresh, torch.tensor(excluded_neighbors)) & is_valid_bb_node[bb_idces_within_thresh]]
+    return len(bb_idces_within_thresh)
+def identify_valid_vandermers(args):
+    """
+    Constructs a tensor containing all the number of contacts for each residue that can be sampled from for chemical groups.
+    By using every sidechain as a chemical group, we will load the actual chemical groups at training time.
+    These can be used to sample as probabilities once divided by the sum.
+    """
+    complex_graph, max_dist, buffer_residue_num = args
+    # Constructs a mask tracking whether index is a valid coordinate frame / residue label to train over.
+    #is_in_residue_vocabulary = torch.tensor([x in aa_short2long for x in data['seq']]).bool()
+    coords, seq = complex_graph.coords, complex_graph.seq
+    is_valid_bb_node = (coords[:, :4].isnan().sum(dim=(1,2)) == 0).bool() #* is_in_residue_vocabulary
+    valid_cg_idces = []
+    for idx, aa in enumerate(seq):
+        if aa not in aa_to_cg_indices:
+            valid_cg_idces.append(0)
+        else:
+            indices = aa_to_cg_indices[aa]
+            cg_coordinates = coords[idx][indices]
+            # remove chemical group residues that aren't fully resolved.
+            if torch.any(cg_coordinates.isnan()).item():
+                valid_cg_idces.append(0)
+                continue
+            nbr_count = compute_num_ca_neighbors(coords, cg_coordinates, idx, is_valid_bb_node,
+                                                 max_dist=max_dist, buffer_residue_num=buffer_residue_num)
+            valid_cg_idces.append(nbr_count)
+    return complex_graph.name, torch.tensor(valid_cg_idces)
+def fast_identify_valid_vandermers(coords, seq, max_dist=5, buffer_residue_num=7):
+    offset = 10000 + max_dist
+    R = coords.shape[0]
+    coords = coords.numpy().reshape(-1, 3)
+    pdist_mat = squareform(pdist(coords))
+    pdist_mat = pdist_mat.reshape((R, 14, R, 14))
+    pdist_mat = np.nan_to_num(pdist_mat, nan=offset)
+    pdist_mat = np.min(pdist_mat, axis=(1, 3))
+    # compute pairwise distances
+    pdist_mat = pdist_mat + np.diag(np.ones(len(seq)) * offset)
+    for i in range(1, buffer_residue_num+1):
+        pdist_mat += np.diag(np.ones(len(seq)-i) * offset, k=i) + np.diag(np.ones(len(seq)-i) * offset, k=-i)
+    # get number of residues that are within max_dist of each other
+    nbr_count = np.sum(pdist_mat < max_dist, axis=1)
+    return torch.tensor(nbr_count)
+def compute_cg_features(aa, aa_smile):
+    """
+    Given an amino acid and a smiles string returns the stacked tensor of chemical group atom encodings.
+    The order of the output tensor rows corresponds to the index the atoms appear in aa_to_cg_indices from constants.
+    """
+    # Handle any residues that we don't have chemical groups for (ex: GLY if not using bb_cnh and bb_cco)
+    aa_short = aa_long2short[aa]
+    if aa_short not in aa_to_cg_indices:
+        return None
+    # Create rdkit molecule from smiles string.
+    mol = Chem.MolFromSmiles(aa_smile)
+    complex_graph = HeteroData()
+    get_lig_graph(mol, complex_graph)
+    atoms_to_keep = torch.tensor([i for i, _ in cg_rdkit_indices[aa].items()]).long()
+    complex_graph['ligand', 'ligand'].edge_index, complex_graph['ligand', 'ligand'].edge_attr = \
+        subgraph(atoms_to_keep, complex_graph['ligand', 'ligand'].edge_index, complex_graph['ligand', 'ligand'].edge_attr, relabel_nodes=True)
+    complex_graph['ligand'].x = complex_graph['ligand'].x[atoms_to_keep]
+    edge_mask, mask_rotate = get_transformation_mask(complex_graph)
+    complex_graph['ligand'].edge_mask = torch.tensor(edge_mask)
+    complex_graph['ligand'].mask_rotate = mask_rotate
+    return complex_graph
+class PDBSidechain(Dataset):
+    def __init__(self, root, transform=None, cache_path='data/cache', split='train', limit_complexes=0,
+                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, remove_hs=True, all_atoms=False,
+                 atom_radius=5, atom_max_neighbors=None, sequences_to_embeddings=None,
+                 knn_only_graph=True, multiplicity=1, vandermers_max_dist=5, vandermers_buffer_residue_num=7,
+                 vandermers_min_contacts=5, remove_second_segment=False, merge_clusters=1, vandermers_extraction=True,
+                 add_random_ligand=False):
+        """Load the protein graphs of one split (building the cache when missing) and index them by cluster.
+        Steps, in order: read the split, preprocess if any cached batch file is missing, load
+        the graphs and contact counts, filter the graphs, group them by cluster, set up the
+        sampling weights and optionally read 'data/smiles_list.csv'.
+        Only remove_hs=True is supported.
+        """
+        super(PDBSidechain, self).__init__(root, transform)
+        assert remove_hs == True, "not implemented yet"
+        self.root = root
+        self.split = split
+        self.limit_complexes = limit_complexes
+        self.receptor_radius = receptor_radius
+        self.knn_only_graph = knn_only_graph
+        self.multiplicity = multiplicity
+        self.c_alpha_max_neighbors = c_alpha_max_neighbors
+        self.num_workers = num_workers
+        self.sequences_to_embeddings = sequences_to_embeddings
+        self.remove_second_segment = remove_second_segment
+        self.merge_clusters = merge_clusters
+        self.vandermers_extraction = vandermers_extraction
+        self.add_random_ligand = add_random_ligand
+        self.all_atoms = all_atoms
+        self.atom_radius = atom_radius
+        self.atom_max_neighbors = atom_max_neighbors
+        if vandermers_extraction:
+            # One chemical-group graph per residue type (None where compute_cg_features has none).
+            self.cg_node_feature_lookup_dict = {aa_long2short[aa]: compute_cg_features(aa, aa_smile) for aa, aa_smile in
+                                           amino_acid_smiles.items()}
+        # The cache directory name encodes the settings that affect the stored graphs.
+        self.cache_path = os.path.join(cache_path, f'PDB3_limit{self.limit_complexes}_INDEX{self.split}'
+                                                        f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
+                                            + (''if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
+                                            + ('' if not self.knn_only_graph else '_knnOnly'))
+        self.read_split()
+        if not self.check_all_proteins():
+            os.makedirs(self.cache_path, exist_ok=True)
+            self.preprocess()
+        # These must be set before collect_proteins, which uses them in the vandermers file names.
+        self.vandermers_max_dist = vandermers_max_dist
+        self.vandermers_buffer_residue_num = vandermers_buffer_residue_num
+        self.vandermers_min_contacts = vandermers_min_contacts
+        self.collect_proteins()
+        filtered_proteins = []
+        if vandermers_extraction:
+            # Keep proteins with at least one residue having 10 or more contacts.
+            for complex_graph in tqdm(self.protein_graphs):
+                if complex_graph.name in self.vandermers and torch.any(self.vandermers[complex_graph.name] >= 10):
+                    filtered_proteins.append(complex_graph)
+            print(f"Computed vandermers and kept {len(filtered_proteins)} proteins out of {len(self.protein_graphs)}")
+        else:
+            filtered_proteins = self.protein_graphs
+        # When embeddings are supplied, keep only proteins whose original sequence has one.
+        second_filter = []
+        for complex_graph in tqdm(filtered_proteins):
+            if sequences_to_embeddings is None or complex_graph.orig_seq in sequences_to_embeddings:
+                second_filter.append(complex_graph)
+        print(f"Checked embeddings available and kept {len(second_filter)} proteins out of {len(filtered_proteins)}")
+        self.protein_graphs = second_filter
+        # filter clusters that have no protein graphs
+        self.split_clusters = list(set([g.cluster for g in self.protein_graphs]))
+        self.cluster_to_complexes = {c: [] for c in self.split_clusters}
+        for p in self.protein_graphs:
+            self.cluster_to_complexes[p['cluster']].append(p)
+        self.split_clusters = [c for c in self.split_clusters if len(self.cluster_to_complexes[c]) > 0]
+        print("Total elements in set", len(self.split_clusters) * self.multiplicity // self.merge_clusters)
+        self.name_to_complex = {p.name: p for p in self.protein_graphs}
+        self.define_probabilities()
+        if self.add_random_ligand:
+            # read csv with all smiles
+            with open('data/smiles_list.csv', 'r') as f:
+                self.smiles_list = f.readlines()
+            # Only the first comma-separated field of each line is kept.
+            self.smiles_list = [s.split(',')[0] for s in self.smiles_list]
+    def define_probabilities(self):
+        if not self.vandermers_extraction:
+            return
+        if self.vandermers_min_contacts is not None:
+            self.probabilities = torch.arange(1000) - self.vandermers_min_contacts + 1
+            self.probabilities[:self.vandermers_min_contacts] = 0
+        else:
+            with open('data/pdbbind_counts.pkl', 'rb') as f:
+                pdbbind_counts = pickle.load(f)
+            pdb_counts = torch.ones(1000)
+            for contacts in self.vandermers.values():
+                pdb_counts.index_add_(0, contacts, torch.ones(contacts.shape))
+            print(pdbbind_counts[:30])
+            print(pdb_counts[:30])
+            self.probabilities = pdbbind_counts / pdb_counts
+            self.probabilities[:7] = 0
+    def len(self):
+        return len(self.split_clusters) * self.multiplicity // self.merge_clusters
+    def get(self, idx=None, protein=None, smiles=None):
+        """Return one training sample as a deep copy of a cached protein graph.
+        With ``protein`` and ``smiles`` both given, the named protein is used; otherwise a
+        random complex is drawn from the cluster selected by ``idx``. Depending on the dataset
+        options the sample then gets language-model embeddings appended, a sidechain cut out
+        and re-added as the 'ligand' (vandermers extraction), and/or a ligand built from a
+        SMILES string. Bookkeeping attributes are stripped before returning.
+        Returns None when the requested ``smiles`` cannot be embedded; several failure cases
+        retry with a random index instead.
+        """
+        assert idx is not None or (protein is not None and smiles is not None), "provide idx or protein or smile"
+        if protein is None or smiles is None:
+            idx = idx % len(self.split_clusters)
+            if self.merge_clusters > 1:
+                # Pick a random member of the group of merge_clusters consecutive clusters.
+                idx = idx * self.merge_clusters
+                idx = idx + random.randint(0, self.merge_clusters - 1)
+                idx = min(idx, len(self.split_clusters) - 1)
+            cluster = self.split_clusters[idx]
+            protein_graph = copy.deepcopy(random.choice(self.cluster_to_complexes[cluster]))
+        else:
+            protein_graph = copy.deepcopy(self.name_to_complex[protein])
+        if self.sequences_to_embeddings is not None:
+            #print(self.sequences_to_embeddings[protein_graph.orig_seq].shape, len(protein_graph.orig_seq), protein_graph.to_keep.shape)
+            if len(protein_graph.orig_seq) != len(self.sequences_to_embeddings[protein_graph.orig_seq]):
+                print('problem with ESM embeddings')
+                # random.randint includes the upper bound; the modulo above wraps it back into range.
+                return self.get(random.randint(0, self.len()))
+            # Embeddings cover the original sequence; to_keep selects the residues kept in the graph.
+            lm_embeddings = self.sequences_to_embeddings[protein_graph.orig_seq][protein_graph.to_keep]
+            protein_graph['receptor'].x = torch.cat([protein_graph['receptor'].x, lm_embeddings], dim=1)
+        if self.vandermers_extraction:
+            # select sidechain to remove
+            vandermers_contacts = self.vandermers[protein_graph.name]
+            vandermers_probs = self.probabilities[vandermers_contacts].numpy()
+            if not np.any(vandermers_contacts.numpy() >= 10):
+                print('no vandarmers >= 10 retrying with new one')
+                return self.get(random.randint(0, self.len()))
+            sidechain_idx = np.random.choice(np.arange(len(vandermers_probs)), p=vandermers_probs / np.sum(vandermers_probs))
+            # remove part of the sequence
+            residues_to_keep = np.ones(len(protein_graph.seq), dtype=bool)
+            residues_to_keep[max(0, sidechain_idx - self.vandermers_buffer_residue_num):
+                             min(sidechain_idx + self.vandermers_buffer_residue_num + 1, len(protein_graph.seq))] = False
+            if self.remove_second_segment:
+                # Sample a second residue more than limit_closeness from the first (in receptor
+                # positions) and outside the first window, then remove its window as well.
+                pos_idx = protein_graph['receptor'].pos[sidechain_idx]
+                limit_closeness = 10
+                far_enough = torch.sum((protein_graph['receptor'].pos - pos_idx[None, :]) ** 2, dim=-1) > limit_closeness ** 2
+                vandermers_probs = vandermers_probs * far_enough.float().numpy()
+                vandermers_probs[max(0, sidechain_idx - self.vandermers_buffer_residue_num):
+                                 min(sidechain_idx + self.vandermers_buffer_residue_num + 1, len(protein_graph.seq))] = 0
+                if np.all(vandermers_probs<=0):
+                    print('no second vandermer available retrying with new one')
+                    return self.get(random.randint(0, self.len()))
+                sc2_idx = np.random.choice(np.arange(len(vandermers_probs)), p=vandermers_probs / np.sum(vandermers_probs))
+                residues_to_keep[max(0, sc2_idx - self.vandermers_buffer_residue_num):
+                                 min(sc2_idx + self.vandermers_buffer_residue_num + 1, len(protein_graph.seq))] = False
+            residues_to_keep = torch.from_numpy(residues_to_keep)
+            protein_graph['receptor'].pos = protein_graph['receptor'].pos[residues_to_keep]
+            protein_graph['receptor'].x = protein_graph['receptor'].x[residues_to_keep]
+            protein_graph['receptor'].side_chain_vecs = protein_graph['receptor'].side_chain_vecs[residues_to_keep]
+            protein_graph['receptor', 'rec_contact', 'receptor'].edge_index = \
+                subgraph(residues_to_keep, protein_graph['receptor', 'rec_contact', 'receptor'].edge_index, relabel_nodes=True)[0]
+            # create the sidechain ligand
+            sidechain_aa = protein_graph.seq[sidechain_idx]
+            # NOTE(review): this is the shared graph object stored in the lookup dict, not a
+            # copy; its 'pos' is overwritten on every call.
+            ligand_graph = self.cg_node_feature_lookup_dict[sidechain_aa]
+            ligand_graph['ligand'].pos = protein_graph.coords[sidechain_idx][protein_graph.mask[sidechain_idx]]
+            for type in ligand_graph.node_types + ligand_graph.edge_types:
+                for key, value in ligand_graph[type].items():
+                    protein_graph[type][key] = value
+            protein_graph['ligand'].orig_pos = protein_graph['ligand'].pos.numpy()
+            # Centre receptor and ligand on the mean position of the remaining receptor nodes.
+            protein_center = torch.mean(protein_graph['receptor'].pos, dim=0, keepdim=True)
+            protein_graph['receptor'].pos = protein_graph['receptor'].pos - protein_center
+            protein_graph['ligand'].pos = protein_graph['ligand'].pos - protein_center
+            protein_graph.original_center = protein_center
+            protein_graph['receptor_name'] = protein_graph.name
+        else:
+            protein_center = torch.mean(protein_graph['receptor'].pos, dim=0, keepdim=True)
+            protein_graph['receptor'].pos = protein_graph['receptor'].pos - protein_center
+            protein_graph.original_center = protein_center
+            protein_graph['receptor_name'] = protein_graph.name
+        if self.add_random_ligand:
+            if smiles is not None:
+                mol = MolFromSmiles(smiles)
+                try:
+                    generate_conformer(mol)
+                except Exception as e:
+                    print("failed to generate the given ligand returning None", e)
+                    return None
+            else:
+                # Keep drawing SMILES until a conformer is generated.
+                # presumably generate_conformer returns a truthy value on failure - verify in process_mols
+                success = False
+                while not success:
+                    smiles = random.choice(self.smiles_list)
+                    mol = MolFromSmiles(smiles)
+                    try:
+                        success = not generate_conformer(mol)
+                    except Exception as e:
+                        print(e, "changing ligand")
+            lig_graph = HeteroData()
+            get_lig_graph(mol, lig_graph)
+            edge_mask, mask_rotate = get_transformation_mask(lig_graph)
+            lig_graph['ligand'].edge_mask = torch.tensor(edge_mask)
+            lig_graph['ligand'].mask_rotate = mask_rotate
+            lig_graph['ligand'].smiles = smiles
+            # The ligand is centred on its own mean position.
+            lig_graph['ligand'].pos = lig_graph['ligand'].pos - torch.mean(lig_graph['ligand'].pos, dim=0, keepdim=True)
+            for type in lig_graph.node_types + lig_graph.edge_types:
+                for key, value in lig_graph[type].items():
+                    protein_graph[type][key] = value
+        # Strip bookkeeping attributes from the graph and its receptor store before returning.
+        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq', 'to_keep', 'chain_ids']:
+            if hasattr(protein_graph, a):
+                delattr(protein_graph, a)
+            if hasattr(protein_graph['receptor'], a):
+                delattr(protein_graph['receptor'], a)
+        return protein_graph
+    def read_split(self):
+        # read CSV file
+        df = pd.read_csv(self.root + "/list.csv")
+        print("Loaded list CSV file")
+        # get clusters and filter by split
+        if self.split == "train":
+            val_clusters = set(read_strings_from_txt(self.root + "/valid_clusters.txt"))
+            test_clusters = set(read_strings_from_txt(self.root + "/test_clusters.txt"))
+            clusters = df["CLUSTER"].unique()
+            clusters = [int(c) for c in clusters if c not in val_clusters and c not in test_clusters]
+        elif self.split == "val":
+            clusters = [int(s) for s in read_strings_from_txt(self.root + "/valid_clusters.txt")]
+        elif self.split == "test":
+            clusters = [int(s) for s in read_strings_from_txt(self.root + "/test_clusters.txt")]
+        else:
+            raise ValueError("Split must be train, val or test")
+        print(self.split, "clusters", len(clusters))
+        clusters = set(clusters)
+        self.chains_in_cluster = []
+        complexes_in_cluster = set()
+        for chain, cluster in zip(df["CHAINID"], df["CLUSTER"]):
+            if cluster not in clusters:
+                continue
+            # limit to one chain per complex
+            if chain[:4] not in complexes_in_cluster:
+                self.chains_in_cluster.append((chain, cluster))
+                complexes_in_cluster.add(chain[:4])
+        print("Filtered chains in cluster", len(self.chains_in_cluster))
+        if self.limit_complexes > 0:
+            self.chains_in_cluster = self.chains_in_cluster[:self.limit_complexes]
+    def check_all_proteins(self):
+        for i in range(len(self.chains_in_cluster)//10000+1):
+            if not os.path.exists(os.path.join(self.cache_path, f"protein_graphs{i}.pkl")):
+                return False
+        return True
+    def collect_proteins(self):
+        """Load all cached protein-graph batches and, if enabled, their per-residue contact counts.
+        Fills ``self.protein_graphs`` (list) and ``self.vandermers`` (protein name -> contact
+        tensor). Contact counts are read from a per-batch pickle when one exists for the current
+        max_dist / buffer settings; otherwise they are computed with identify_valid_vandermers
+        (in a process pool when num_workers > 1) and written to that pickle.
+        """
+        self.protein_graphs = []
+        self.vandermers = {}
+        total_recovered = 0
+        print(f'Loading {len(self.chains_in_cluster)} protein graphs.')
+        # Batches are visited in random order.
+        list_indices = list(range(len(self.chains_in_cluster) // 10000 + 1))
+        random.shuffle(list_indices)
+        for i in list_indices:
+            with open(os.path.join(self.cache_path, f"protein_graphs{i}.pkl"), 'rb') as f:
+                print(i)
+                l = pickle.load(f)
+                total_recovered += len(l)
+                self.protein_graphs.extend(l)
+            if not self.vandermers_extraction:
+                continue
+            # Reuse previously computed contact counts for this batch and parameter pair.
+            if os.path.exists(os.path.join(self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl')):
+                with open(os.path.join(self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl'), 'rb') as f:
+                    vandermers = pickle.load(f)
+                    self.vandermers.update(vandermers)
+                continue
+            vandermers = {}
+            # The pool is entered and exited by hand so the same loop serves both the
+            # multi-process and the plain map() case; it is not closed if the loop raises.
+            if self.num_workers > 1:
+                p = Pool(self.num_workers, maxtasksperchild=1)
+                p.__enter__()
+            with tqdm(total=len(l), desc=f'computing vandermers {i}') as pbar:
+                map_fn = p.imap_unordered if self.num_workers > 1 else map
+                # Each job is a (graph, max_dist, buffer_residue_num) tuple.
+                arguments = zip(l, [self.vandermers_max_dist] * len(l),
+                                [self.vandermers_buffer_residue_num] * len(l))
+                for t in map_fn(identify_valid_vandermers, arguments):
+                    if t is not None:
+                        vandermers[t[0]] = t[1]
+                    pbar.update()
+            if self.num_workers > 1: p.__exit__(None, None, None)
+            with open(os.path.join(self.cache_path, f'vandermers{i}_{self.vandermers_max_dist}_{self.vandermers_buffer_residue_num}.pkl'), 'wb') as f:
+                pickle.dump(vandermers, f)
+            self.vandermers.update(vandermers)
+        print(f"Kept {len(self.protein_graphs)} proteins out of {len(self.chains_in_cluster)} total")
+        return
+    def preprocess(self):
+        # running preprocessing in parallel on multiple workers and saving the progress every 10000 proteins
+        list_indices = list(range(len(self.chains_in_cluster) // 10000 + 1))
+        random.shuffle(list_indices)
+        for i in list_indices:
+            if os.path.exists(os.path.join(self.cache_path, f"protein_graphs{i}.pkl")):
+                continue
+            chains_names = self.chains_in_cluster[10000 * i:10000 * (i + 1)]
+            protein_graphs = []
+            if self.num_workers > 1:
+                p = Pool(self.num_workers, maxtasksperchild=1)
+                p.__enter__()
+            with tqdm(total=len(chains_names),
+                      desc=f'loading protein batch {i}/{len(self.chains_in_cluster) // 10000 + 1}') as pbar:
+                map_fn = p.imap_unordered if self.num_workers > 1 else map
+                for t in map_fn(self.load_chain, chains_names):
+                    if t is not None:
+                        protein_graphs.append(t)
+                    pbar.update()
+            if self.num_workers > 1: p.__exit__(None, None, None)
+            with open(os.path.join(self.cache_path, f"protein_graphs{i}.pkl"), 'wb') as f:
+                pickle.dump(protein_graphs, f)
+        print("Finished preprocessing and saving protein graphs")
+    def load_chain(self, c):
+        chain, cluster = c
+        if not os.path.exists(self.root + f"/pdb/{chain[1:3]}/{chain}.pt"):
+            print("File not found", chain)
+            return None
+        data = torch.load(self.root + f"/pdb/{chain[1:3]}/{chain}.pt")
+        complex_graph = HeteroData()
+        complex_graph['name'] = chain
+        orig_seq = data["seq"]
+        coords = data["xyz"]
+        mask = data["mask"].bool()
+        # remove residues with NaN backbone coordinates
+        to_keep = torch.logical_not(torch.any(torch.isnan(coords[:, :4, 0]), dim=1))
+        coords = coords[to_keep]
+        seq = ''.join(np.asarray(list(orig_seq))[to_keep.numpy()].tolist())
+        mask = mask[to_keep]
+        if len(coords) == 0:
+            print("All coords were NaN", chain)
+            return None
+        try:
+            new_extract_receptor_structure(seq, coords.numpy(), complex_graph=complex_graph, neighbor_cutoff=self.receptor_radius,
+                                           max_neighbors=self.c_alpha_max_neighbors, knn_only_graph=self.knn_only_graph,
+                                           all_atoms=self.all_atoms, atom_cutoff=self.atom_radius,
+                                           atom_max_neighbors=self.atom_max_neighbors)
+        except Exception as e:
+            print("Error in extracting receptor", chain)
+            print(e)
+            return None
+        if torch.any(torch.isnan(complex_graph['receptor'].pos)):
+            print("NaN in pos receptor", chain)
+            return None
+        complex_graph.coords = coords
+        complex_graph.seq = seq
+        complex_graph.mask = mask
+        complex_graph.cluster = cluster
+        complex_graph.orig_seq = orig_seq
+        complex_graph.to_keep = to_keep
+        return complex_graph
+# Manual smoke run: build a small training split from a sample directory and print every item.
+if __name__ == "__main__":
+    dataset = PDBSidechain(root="data/pdb_2021aug02_sample", split="train", multiplicity=1, limit_complexes=150)
+    print(len(dataset))
+    print(dataset[0])
+    for p in dataset:
+        print(p)
+        pass

datasets/pdbbind.py ADDED Viewed

	@@ -0,0 +1,472 @@

+import binascii
+import glob
+import os
+import pickle
+from collections import defaultdict
+from multiprocessing import Pool
+import random
+import copy
+import torch.nn.functional as F
+import numpy as np
+import torch
+from rdkit import Chem
+from rdkit.Chem import MolFromSmiles, AddHs
+from torch_geometric.data import Dataset, HeteroData
+from torch_geometric.transforms import BaseTransform
+from tqdm import tqdm
+from rdkit.Chem import RemoveAllHs
+from datasets.process_mols import read_molecule, get_lig_graph_with_matching, generate_conformer, moad_extract_receptor_structure
+from utils.diffusion_utils import modify_conformer, set_time
+from utils.utils import read_strings_from_txt, crop_beyond
+from utils import so3, torus
+class NoiseTransform(BaseTransform):
+    def __init__(self, t_to_sigma, no_torsion, all_atom, alpha=1, beta=1,
+                 include_miscellaneous_atoms=False, crop_beyond_cutoff=None, time_independent=False, rmsd_cutoff=0,
+                 minimum_t=0, sampling_mixing_coeff=0):
+        self.t_to_sigma = t_to_sigma
+        self.no_torsion = no_torsion
+        self.all_atom = all_atom
+        self.include_miscellaneous_atoms = include_miscellaneous_atoms
+        self.minimum_t = minimum_t
+        self.mixing_coeff = sampling_mixing_coeff
+        self.alpha = alpha
+        self.beta = beta
+        self.crop_beyond_cutoff = crop_beyond_cutoff
+        self.rmsd_cutoff = rmsd_cutoff
+        self.time_independent = time_independent
+    def __call__(self, data):
+        t_tr, t_rot, t_tor, t = self.get_time()
+        return self.apply_noise(data, t_tr, t_rot, t_tor, t)
+    def get_time(self):
+        if self.time_independent:
+            t = np.random.beta(self.alpha, self.beta)
+            t_tr, t_rot, t_tor = t,t,t
+        else:
+            t = None
+            if self.mixing_coeff == 0:
+                t = np.random.beta(self.alpha, self.beta)
+                t = self.minimum_t + t * (1 - self.minimum_t)
+            else:
+                choice = np.random.binomial(1, self.mixing_coeff)
+                t1 = np.random.beta(self.alpha, self.beta)
+                t1 = t1 * self.minimum_t
+                t2 = np.random.beta(self.alpha, self.beta)
+                t2 = self.minimum_t + t2 * (1 - self.minimum_t)
+                t = choice * t1 + (1 - choice) * t2
+            t_tr, t_rot, t_tor = t,t,t
+        return t_tr, t_rot, t_tor, t
+    def apply_noise(self, data, t_tr, t_rot, t_tor, t, tr_update = None, rot_update=None, torsion_updates=None):
+        """Perturb the ligand pose in ``data`` and attach the training targets.
+        Translation, rotation and torsion updates are sampled from the sigmas given by
+        ``t_to_sigma`` unless passed in, and applied with ``modify_conformer``. In
+        time-independent mode the targets are ``data.y`` / ``data.atom_y``, a 0/1 flag for
+        "RMSD to the original pose below rmsd_cutoff". Otherwise the translation, rotation and
+        torsion scores are stored on ``data``. ``data`` is modified in place and returned.
+        """
+        # A non-tensor 'pos' is treated as a collection of candidate poses; one is picked at random.
+        if not torch.is_tensor(data['ligand'].pos):
+            data['ligand'].pos = random.choice(data['ligand'].pos)
+        if self.time_independent:
+            # Keep the unperturbed complex for the RMSD computation below.
+            orig_complex_graph = copy.deepcopy(data)
+        tr_sigma, rot_sigma, tor_sigma = self.t_to_sigma(t_tr, t_rot, t_tor)
+        if self.time_independent:
+            set_time(data, 0, 0, 0, 0, 1, self.all_atom, device=None, include_miscellaneous_atoms=self.include_miscellaneous_atoms)
+        else:
+            set_time(data, t, t_tr, t_rot, t_tor, 1, self.all_atom, device=None, include_miscellaneous_atoms=self.include_miscellaneous_atoms)
+        tr_update = torch.normal(mean=0, std=tr_sigma, size=(1, 3)) if tr_update is None else tr_update
+        rot_update = so3.sample_vec(eps=rot_sigma) if rot_update is None else rot_update
+        torsion_updates = np.random.normal(loc=0.0, scale=tor_sigma, size=data['ligand'].edge_mask.sum()) if torsion_updates is None else torsion_updates
+        torsion_updates = None if self.no_torsion else torsion_updates
+        try:
+            # torch.from_numpy requires rot_update to be a NumPy array here.
+            modify_conformer(data, tr_update, torch.from_numpy(rot_update).float(), torsion_updates)
+        except Exception as e:
+            # Failures are only printed; the method carries on with `data` as it is at this point.
+            print("failed modify conformer")
+            print(e)
+        if self.time_independent:
+            if self.no_torsion:
+                orig_complex_graph['ligand'].orig_pos = (orig_complex_graph['ligand'].pos.cpu().numpy() + orig_complex_graph.original_center.cpu().numpy())
+            # Atoms whose first feature is 0 are excluded from the RMSD
+            # (named filterHs; presumably feature 0 marks hydrogens - verify against the featurizer).
+            filterHs = torch.not_equal(data['ligand'].x[:, 0], 0).cpu().numpy()
+            if isinstance(orig_complex_graph['ligand'].orig_pos, list):
+                orig_complex_graph['ligand'].orig_pos = orig_complex_graph['ligand'].orig_pos[0]
+            ligand_pos = data['ligand'].pos.cpu().numpy()[filterHs]
+            # orig_pos has original_center subtracted to match the frame of data['ligand'].pos.
+            orig_ligand_pos = orig_complex_graph['ligand'].orig_pos[filterHs] - orig_complex_graph.original_center.cpu().numpy()
+            rmsd = np.sqrt(((ligand_pos - orig_ligand_pos) ** 2).sum(axis=1).mean(axis=0))
+            data.y = torch.tensor(rmsd < self.rmsd_cutoff).float().unsqueeze(0)
+            data.atom_y = data.y
+            return data
+        data.tr_score = -tr_update / tr_sigma ** 2
+        data.rot_score = torch.from_numpy(so3.score_vec(vec=rot_update, eps=rot_sigma)).float().unsqueeze(0)
+        data.tor_score = None if self.no_torsion else torch.from_numpy(torus.score(torsion_updates, tor_sigma)).float()
+        data.tor_sigma_edge = None if self.no_torsion else np.ones(data['ligand'].edge_mask.sum()) * tor_sigma
+        if data['ligand'].pos.shape[0] == 1:
+            # if the ligand is a single atom, the rotational score is always 0
+            data.rot_score = data.rot_score * 0
+        if self.crop_beyond_cutoff is not None:
+            # The crop radius grows with the translation noise level (3 * tr_sigma + cutoff).
+            crop_beyond(data, tr_sigma * 3 + self.crop_beyond_cutoff, self.all_atom)
+        # set_time is called again after the optional crop.
+        set_time(data, t, t_tr, t_rot, t_tor, 1, self.all_atom, device=None, include_miscellaneous_atoms=self.include_miscellaneous_atoms)
+        return data
+class PDBBind(Dataset):
+    """Dataset of protein-ligand complex graphs backed by an on-disk pickle cache.
+
+    On construction the cache directory derived from the settings is checked; missing
+    chunks are produced by ``preprocessing`` (split-file mode) or
+    ``inference_preprocessing`` (explicit protein/ligand lists), and all cached graphs
+    are then loaded into memory.
+    """
+    def __init__(self, root, transform=None, cache_path='data/cache', split_path='data/', limit_complexes=0, chain_cutoff=10,
+                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, popsize=15, maxiter=15,
+                 matching=True, keep_original=False, max_lig_size=None, remove_hs=False, num_conformers=1, all_atoms=False,
+                 atom_radius=5, atom_max_neighbors=None, esm_embeddings_path=None, require_ligand=False,
+                 include_miscellaneous_atoms=False,
+                 protein_path_list=None, ligand_descriptions=None, keep_local_structures=False,
+                 protein_file="protein_processed", ligand_file="ligand",
+                 knn_only_graph=False, matching_tries=1, dataset='PDBBind'):
+        """Store the settings, build the cache if needed, and load every complex.
+
+        Args:
+            root: Directory holding one sub-folder per complex (stored as ``self.pdbbind_dir``).
+            transform: Passed to the ``Dataset`` base class.
+            cache_path: Base directory for the cache; suffixes are appended below.
+            split_path: Text file listing the complex names to use.
+            limit_complexes: Keep only the first N names; ``0`` or ``None`` keeps all.
+            num_workers: More than 1 makes preprocessing use a multiprocessing pool.
+            max_lig_size: Ligands with more heavy atoms than this are skipped in ``get_complex``.
+            esm_embeddings_path: Optional location of language-model embeddings.
+            require_ligand: When True, ``get`` attaches the RDKit ligand as ``mol``.
+            protein_path_list, ligand_descriptions: When both are given, inference preprocessing is used.
+            protein_file, ligand_file: File-name suffixes used to locate the receptor and ligand files.
+            dataset: Name used in the cache directory; ``'posebusters'`` also enables loading
+                alternative ligand poses in ``get_complex``.
+
+        The remaining arguments are stored on the instance and forwarded to the
+        graph-construction helpers called from ``get_complex``; most of them are also
+        encoded in the cache directory name.
+        """
+        super(PDBBind, self).__init__(root, transform)
+        self.pdbbind_dir = root
+        self.include_miscellaneous_atoms = include_miscellaneous_atoms
+        self.max_lig_size = max_lig_size
+        self.split_path = split_path
+        self.limit_complexes = limit_complexes
+        self.chain_cutoff = chain_cutoff
+        self.receptor_radius = receptor_radius
+        self.num_workers = num_workers
+        self.c_alpha_max_neighbors = c_alpha_max_neighbors
+        self.remove_hs = remove_hs
+        self.esm_embeddings_path = esm_embeddings_path
+        self.use_old_wrong_embedding_order = False
+        self.require_ligand = require_ligand
+        self.protein_path_list = protein_path_list
+        self.ligand_descriptions = ligand_descriptions
+        self.keep_local_structures = keep_local_structures
+        self.protein_file = protein_file
+        self.fixed_knn_radius_graph = True
+        self.knn_only_graph = knn_only_graph
+        self.matching_tries = matching_tries
+        self.ligand_file = ligand_file
+        self.dataset = dataset
+        # All-atom graphs are only supported together with the kNN-only graph construction.
+        assert knn_only_graph or (not all_atoms)
+        self.all_atoms = all_atoms
+        # `and` binds tighter than `or`: this reads as
+        # matching or (protein_path_list is not None and ligand_descriptions is not None).
+        if matching or protein_path_list is not None and ligand_descriptions is not None:
+            cache_path += '_torsion'
+        if all_atoms:
+            cache_path += '_allatoms'
+        # The cache directory name encodes the preprocessing settings listed below, so a
+        # change to any of them selects a different cache directory. In inference mode a
+        # CRC32 of the ligand descriptions and protein paths is appended as well.
+        self.full_cache_path = os.path.join(cache_path, f'{dataset}3_limit{self.limit_complexes}'
+                                                        f'_INDEX{os.path.splitext(os.path.basename(self.split_path))[0]}'
+                                                        f'_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}'
+                                                        f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
+                                                        f'_chainCutoff{self.chain_cutoff if self.chain_cutoff is None else int(self.chain_cutoff)}'
+                                            + (''if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
+                                            + (''if not matching or num_conformers == 1 else f'_confs{num_conformers}')
+                                            + ('' if self.esm_embeddings_path is None else f'_esmEmbeddings')
+                                            + '_full'
+                                            + ('' if not keep_local_structures else f'_keptLocalStruct')
+                                            + ('' if protein_path_list is None or ligand_descriptions is None else str(binascii.crc32(''.join(ligand_descriptions + protein_path_list).encode())))
+                                            + ('' if protein_file == "protein_processed" else '_' + protein_file)
+                                            + ('' if not self.fixed_knn_radius_graph else (f'_fixedKNN' if not self.knn_only_graph else '_fixedKNNonly'))
+                                            + ('' if not self.include_miscellaneous_atoms else '_miscAtoms')
+                                            + ('' if self.use_old_wrong_embedding_order else '_chainOrd')
+                                            + ('' if self.matching_tries == 1 else f'_tries{matching_tries}'))
+        self.popsize, self.maxiter = popsize, maxiter
+        self.matching, self.keep_original = matching, keep_original
+        self.num_conformers = num_conformers
+        self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
+        # Build whatever cache chunks are missing before loading.
+        if not self.check_all_complexes():
+            os.makedirs(self.full_cache_path, exist_ok=True)
+            if protein_path_list is None or ligand_descriptions is None:
+                self.preprocessing()
+            else:
+                self.inference_preprocessing()
+        self.complex_graphs, self.rdkit_ligands = self.collect_all_complexes()
+        print_statistics(self.complex_graphs)
+        # Record the names of the complexes that were actually loaded; the file name uses
+        # the first three characters of the split file's stem.
+        list_names = [complex['name'] for complex in self.complex_graphs]
+        with open(os.path.join(self.full_cache_path, f'pdbbind_{os.path.splitext(os.path.basename(self.split_path))[0][:3]}_names.txt'), 'w') as f:
+            f.write('\n'.join(list_names))
+    def len(self):
+        return len(self.complex_graphs)
+    def get(self, idx):
+        complex_graph = copy.deepcopy(self.complex_graphs[idx])
+        if self.require_ligand:
+            complex_graph.mol = RemoveAllHs(copy.deepcopy(self.rdkit_ligands[idx]))
+        for a in ['random_coords', 'coords', 'seq', 'sequence', 'mask', 'rmsd_matching', 'cluster', 'orig_seq', 'to_keep', 'chain_ids']:
+            if hasattr(complex_graph, a):
+                delattr(complex_graph, a)
+            if hasattr(complex_graph['receptor'], a):
+                delattr(complex_graph['receptor'], a)
+        return complex_graph
+    def preprocessing(self):
+        print(f'Processing complexes from [{self.split_path}] and saving it to [{self.full_cache_path}]')
+        complex_names_all = read_strings_from_txt(self.split_path)
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        print(f'Loading {len(complex_names_all)} complexes.')
+        if self.esm_embeddings_path is not None:
+            id_to_embeddings = torch.load(self.esm_embeddings_path)
+            chain_embeddings_dictlist = defaultdict(list)
+            chain_indices_dictlist = defaultdict(list)
+            for key, embedding in id_to_embeddings.items():
+                key_name = key.split('_chain_')[0]
+                if key_name in complex_names_all:
+                    chain_embeddings_dictlist[key_name].append(embedding)
+                    chain_indices_dictlist[key_name].append(int(key.split('_chain_')[1]))
+            lm_embeddings_chains_all = []
+            for name in complex_names_all:
+                complex_chains_embeddings = chain_embeddings_dictlist[name]
+                complex_chains_indices = chain_indices_dictlist[name]
+                chain_reorder_idx = np.argsort(complex_chains_indices)
+                reordered_chains = [complex_chains_embeddings[i] for i in chain_reorder_idx]
+                lm_embeddings_chains_all.append(reordered_chains)
+        else:
+            lm_embeddings_chains_all = [None] * len(complex_names_all)
+        # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
+        list_indices = list(range(len(complex_names_all)//1000+1))
+        random.shuffle(list_indices)
+        for i in list_indices:
+            if os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
+                continue
+            complex_names = complex_names_all[1000*i:1000*(i+1)]
+            lm_embeddings_chains = lm_embeddings_chains_all[1000*i:1000*(i+1)]
+            complex_graphs, rdkit_ligands = [], []
+            if self.num_workers > 1:
+                p = Pool(self.num_workers, maxtasksperchild=1)
+                p.__enter__()
+            with tqdm(total=len(complex_names), desc=f'loading complexes {i}/{len(complex_names_all)//1000+1}') as pbar:
+                map_fn = p.imap_unordered if self.num_workers > 1 else map
+                for t in map_fn(self.get_complex, zip(complex_names, lm_embeddings_chains, [None] * len(complex_names), [None] * len(complex_names))):
+                    complex_graphs.extend(t[0])
+                    rdkit_ligands.extend(t[1])
+                    pbar.update()
+            if self.num_workers > 1: p.__exit__(None, None, None)
+            with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'wb') as f:
+                pickle.dump((complex_graphs), f)
+            with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
+                pickle.dump((rdkit_ligands), f)
+    def inference_preprocessing(self):
+        ligands_list = []
+        print('Reading molecules and generating local structures with RDKit')
+        for ligand_description in tqdm(self.ligand_descriptions):
+            mol = MolFromSmiles(ligand_description)  # check if it is a smiles or a path
+            if mol is not None:
+                mol = AddHs(mol)
+                generate_conformer(mol)
+                ligands_list.append(mol)
+            else:
+                mol = read_molecule(ligand_description, remove_hs=False, sanitize=True)
+                if not self.keep_local_structures:
+                    mol.RemoveAllConformers()
+                    mol = AddHs(mol)
+                    generate_conformer(mol)
+                ligands_list.append(mol)
+        if self.esm_embeddings_path is not None:
+            print('Reading language model embeddings.')
+            lm_embeddings_chains_all = []
+            if not os.path.exists(self.esm_embeddings_path): raise Exception('ESM embeddings path does not exist: ',self.esm_embeddings_path)
+            for protein_path in self.protein_path_list:
+                embeddings_paths = sorted(glob.glob(os.path.join(self.esm_embeddings_path, os.path.basename(protein_path)) + '*'))
+                lm_embeddings_chains = []
+                for embeddings_path in embeddings_paths:
+                    lm_embeddings_chains.append(torch.load(embeddings_path)['representations'][33])
+                lm_embeddings_chains_all.append(lm_embeddings_chains)
+        else:
+            lm_embeddings_chains_all = [None] * len(self.protein_path_list)
+        print('Generating graphs for ligands and proteins')
+        # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
+        list_indices = list(range(len(self.protein_path_list)//1000+1))
+        random.shuffle(list_indices)
+        for i in list_indices:
+            if os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
+                continue
+            protein_paths_chunk = self.protein_path_list[1000*i:1000*(i+1)]
+            ligand_description_chunk = self.ligand_descriptions[1000*i:1000*(i+1)]
+            ligands_chunk = ligands_list[1000 * i:1000 * (i + 1)]
+            lm_embeddings_chains = lm_embeddings_chains_all[1000*i:1000*(i+1)]
+            complex_graphs, rdkit_ligands = [], []
+            if self.num_workers > 1:
+                p = Pool(self.num_workers, maxtasksperchild=1)
+                p.__enter__()
+            with tqdm(total=len(protein_paths_chunk), desc=f'loading complexes {i}/{len(protein_paths_chunk)//1000+1}') as pbar:
+                map_fn = p.imap_unordered if self.num_workers > 1 else map
+                for t in map_fn(self.get_complex, zip(protein_paths_chunk, lm_embeddings_chains, ligands_chunk,ligand_description_chunk)):
+                    complex_graphs.extend(t[0])
+                    rdkit_ligands.extend(t[1])
+                    pbar.update()
+            if self.num_workers > 1: p.__exit__(None, None, None)
+            with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'wb') as f:
+                pickle.dump((complex_graphs), f)
+            with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
+                pickle.dump((rdkit_ligands), f)
+    def check_all_complexes(self):
+        if os.path.exists(os.path.join(self.full_cache_path, f"heterographs.pkl")):
+            return True
+        complex_names_all = read_strings_from_txt(self.split_path)
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        for i in range(len(complex_names_all) // 1000 + 1):
+            if not os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
+                return False
+        return True
+    def collect_all_complexes(self):
+        print('Collecting all complexes from cache', self.full_cache_path)
+        if os.path.exists(os.path.join(self.full_cache_path, f"heterographs.pkl")):
+            with open(os.path.join(self.full_cache_path, "heterographs.pkl"), 'rb') as f:
+                complex_graphs = pickle.load(f)
+            if self.require_ligand:
+                with open(os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), 'rb') as f:
+                    rdkit_ligands = pickle.load(f)
+            else:
+                rdkit_ligands = None
+            return complex_graphs, rdkit_ligands
+        complex_names_all = read_strings_from_txt(self.split_path)
+        if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[:self.limit_complexes]
+        complex_graphs_all = []
+        for i in range(len(complex_names_all) // 1000 + 1):
+            with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'rb') as f:
+                print(i)
+                l = pickle.load(f)
+                complex_graphs_all.extend(l)
+        rdkit_ligands_all = []
+        for i in range(len(complex_names_all) // 1000 + 1):
+            with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'rb') as f:
+                l = pickle.load(f)
+                rdkit_ligands_all.extend(l)
+        return complex_graphs_all, rdkit_ligands_all
+    def get_complex(self, par):
+        """Build the graph for one complex.
+
+        Args:
+            par: Tuple ``(name, lm_embedding_chains, ligand, ligand_description)``.
+
+        Returns:
+            ``([complex_graph], [lig])`` on success, or ``([], [])`` when the complex folder
+            is missing, the ligand exceeds ``self.max_lig_size``, or graph construction raises.
+        """
+        name, lm_embedding_chains, ligand, ligand_description = par
+        # NOTE(review): `ligand` only takes part in this existence check and `ligand_description`
+        # is not used at all; the ligand is always read from disk below. Confirm this is
+        # intended for the inference path.
+        if not os.path.exists(os.path.join(self.pdbbind_dir, name)) and ligand is None:
+            print("Folder not found", name)
+            return [], []
+        try:
+            lig = read_mol(self.pdbbind_dir, name, suffix=self.ligand_file, remove_hs=False)
+            if self.max_lig_size != None and lig.GetNumHeavyAtoms() > self.max_lig_size:
+                print(f'Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data.')
+                return [], []
+            complex_graph = HeteroData()
+            complex_graph['name'] = name
+            get_lig_graph_with_matching(lig, complex_graph, self.popsize, self.maxiter, self.matching, self.keep_original,
+                                        self.num_conformers, remove_hs=self.remove_hs, tries=self.matching_tries)
+            moad_extract_receptor_structure(path=os.path.join(self.pdbbind_dir, name, f'{name}_{self.protein_file}.pdb'),
+                                            complex_graph=complex_graph,
+                                            neighbor_cutoff=self.receptor_radius,
+                                            max_neighbors=self.c_alpha_max_neighbors,
+                                            lm_embeddings=lm_embedding_chains,
+                                            knn_only_graph=self.knn_only_graph,
+                                            all_atoms=self.all_atoms,
+                                            atom_cutoff=self.atom_radius,
+                                            atom_max_neighbors=self.atom_max_neighbors)
+        except Exception as e:
+            # Any failure while reading or featurizing drops the complex instead of aborting the run.
+            print(f'Skipping {name} because of the error:')
+            print(e)
+            return [], []
+        if self.dataset == 'posebusters':
+            # Collect the heavy-atom coordinates of every pose in `{name}_ligands.sdf` as alternative
+            # reference positions. This part runs outside the try block above, so errors here propagate.
+            other_positions = []
+            all_mol_file = os.path.join(self.pdbbind_dir, name, f'{name}_ligands.sdf')
+            supplier = Chem.SDMolSupplier(all_mol_file, sanitize=False, removeHs=False)
+            for mol in supplier:
+                Chem.SanitizeMol(mol)
+                all_mol = RemoveAllHs(mol)
+                for conf in all_mol.GetConformers():
+                    other_positions.append(conf.GetPositions())
+            print(f'Found {len(other_positions)} alternative poses for {name}')
+            complex_graph['ligand'].orig_pos = np.asarray(other_positions)
+        # Shift every coordinate set so that the mean receptor position becomes the origin.
+        protein_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
+        complex_graph['receptor'].pos -= protein_center
+        if self.all_atoms:
+            complex_graph['atom'].pos -= protein_center
+        if (not self.matching) or self.num_conformers == 1:
+            complex_graph['ligand'].pos -= protein_center
+        else:
+            # Otherwise the ligand positions are iterated and each entry is shifted in place.
+            for p in complex_graph['ligand'].pos:
+                p -= protein_center
+        # Keep the offset so original coordinates can be recovered later.
+        complex_graph.original_center = protein_center
+        complex_graph['receptor_name'] = name
+        return [complex_graph], [lig]
+def print_statistics(complex_graphs):
+    statistics = ([], [], [], [], [], [])
+    receptor_sizes = []
+    for complex_graph in complex_graphs:
+        lig_pos = complex_graph['ligand'].pos if torch.is_tensor(complex_graph['ligand'].pos) else complex_graph['ligand'].pos[0]
+        receptor_sizes.append(complex_graph['receptor'].pos.shape[0])
+        radius_protein = torch.max(torch.linalg.vector_norm(complex_graph['receptor'].pos, dim=1))
+        molecule_center = torch.mean(lig_pos, dim=0)
+        radius_molecule = torch.max(
+            torch.linalg.vector_norm(lig_pos - molecule_center.unsqueeze(0), dim=1))
+        distance_center = torch.linalg.vector_norm(molecule_center)
+        statistics[0].append(radius_protein)
+        statistics[1].append(radius_molecule)
+        statistics[2].append(distance_center)
+        if "rmsd_matching" in complex_graph:
+            statistics[3].append(complex_graph.rmsd_matching)
+        else:
+            statistics[3].append(0)
+        statistics[4].append(int(complex_graph.random_coords) if "random_coords" in complex_graph else -1)
+        if "random_coords" in complex_graph and complex_graph.random_coords and "rmsd_matching" in complex_graph:
+            statistics[5].append(complex_graph.rmsd_matching)
+    if len(statistics[5]) == 0:
+        statistics[5].append(-1)
+    name = ['radius protein', 'radius molecule', 'distance protein-mol', 'rmsd matching', 'random coordinates', 'random rmsd matching']
+    print('Number of complexes: ', len(complex_graphs))
+    for i in range(len(name)):
+        array = np.asarray(statistics[i])
+        print(f"{name[i]}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}")
+    return
+def read_mol(pdbbind_dir, name, suffix='ligand', remove_hs=False):
+    lig = read_molecule(os.path.join(pdbbind_dir, name, f'{name}_{suffix}.sdf'), remove_hs=remove_hs, sanitize=True)
+    if lig is None:  # read mol2 file if sdf file cannot be sanitized
+        lig = read_molecule(os.path.join(pdbbind_dir, name, f'{name}_{suffix}.mol2'), remove_hs=remove_hs, sanitize=True)
+    return lig
+def read_mols(pdbbind_dir, name, remove_hs=False):
+    ligs = []
+    for file in os.listdir(os.path.join(pdbbind_dir, name)):
+        if file.endswith(".sdf") and 'rdkit' not in file:
+            lig = read_molecule(os.path.join(pdbbind_dir, name, file), remove_hs=remove_hs, sanitize=True)
+            if lig is None and os.path.exists(os.path.join(pdbbind_dir, name, file[:-4] + ".mol2")):  # read mol2 file if sdf file cannot be sanitized
+                print('Using the .sdf file failed. We found a .mol2 file instead and are trying to use that.')
+                lig = read_molecule(os.path.join(pdbbind_dir, name, file[:-4] + ".mol2"), remove_hs=remove_hs, sanitize=True)
+            if lig is not None:
+                ligs.append(lig)
+    return ligs

datasets/process_mols.py ADDED Viewed

	@@ -0,0 +1,499 @@

+import copy
+import warnings
+from pathlib import Path
+import numpy as np
+import torch
+from Bio.PDB import PDBParser
+from rdkit import Chem
+from rdkit.Chem.rdchem import BondType as BT
+from rdkit.Chem import AllChem, GetPeriodicTable, RemoveHs
+from rdkit.Geometry import Point3D
+from torch import cdist
+from torch_cluster import knn_graph
+import prody as pr
+import torch.nn.functional as F
+from datasets.conformer_matching import get_torsion_angles, optimize_rotatable_bonds
+from datasets.constants import aa_short2long, atom_order, three_to_one
+from datasets.parse_chi import get_chi_angles, get_coords, aa_idx2aa_short, get_onehot_sequence
+from utils.torsion import get_transformation_mask
+from utils.logging_utils import get_logger
+# Module-level logger and RDKit periodic table handle.
+logger = get_logger()
+periodic_table = GetPeriodicTable()
+# Vocabularies for the categorical atom / residue features. A feature value is encoded
+# as its index in the corresponding list; lists ending in 'misc' use that last slot for
+# values outside the vocabulary (see safe_index, which returns the last index when the
+# value is not found).
+allowable_features = {
+    'possible_atomic_num_list': list(range(1, 119)) + ['misc'],
+    'possible_chirality_list': [
+        'CHI_UNSPECIFIED',
+        'CHI_TETRAHEDRAL_CW',
+        'CHI_TETRAHEDRAL_CCW',
+        'CHI_OTHER'
+    ],
+    'possible_degree_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 'misc'],
+    'possible_numring_list': [0, 1, 2, 3, 4, 5, 6, 'misc'],
+    'possible_implicit_valence_list': [0, 1, 2, 3, 4, 5, 6, 'misc'],
+    'possible_formal_charge_list': [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 'misc'],
+    'possible_numH_list': [0, 1, 2, 3, 4, 5, 6, 7, 8, 'misc'],
+    'possible_number_radical_e_list': [0, 1, 2, 3, 4, 'misc'],
+    'possible_hybridization_list': [
+        'SP', 'SP2', 'SP3', 'SP3D', 'SP3D2', 'misc'
+    ],
+    'possible_is_aromatic_list': [False, True],
+    'possible_is_in_ring3_list': [False, True],
+    'possible_is_in_ring4_list': [False, True],
+    'possible_is_in_ring5_list': [False, True],
+    'possible_is_in_ring6_list': [False, True],
+    'possible_is_in_ring7_list': [False, True],
+    'possible_is_in_ring8_list': [False, True],
+    'possible_amino_acids': ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET',
+                             'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'HIP', 'HIE', 'TPO', 'HID', 'LEV', 'MEU',
+                             'PTR', 'GLV', 'CYT', 'SEP', 'HIZ', 'CYM', 'GLM', 'ASQ', 'TYS', 'CYX', 'GLZ', 'misc'],
+    'possible_atom_type_2': ['C*', 'CA', 'CB', 'CD', 'CE', 'CG', 'CH', 'CZ', 'N*', 'ND', 'NE', 'NH', 'NZ', 'O*', 'OD',
+                             'OE', 'OG', 'OH', 'OX', 'S*', 'SD', 'SG', 'misc'],
+    'possible_atom_type_3': ['C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3', 'CG', 'CG1', 'CG2', 'CH2',
+                             'CZ', 'CZ2', 'CZ3', 'N', 'ND1', 'ND2', 'NE', 'NE1', 'NE2', 'NH1', 'NH2', 'NZ', 'O', 'OD1',
+                             'OD2', 'OE1', 'OE2', 'OG', 'OG1', 'OH', 'OXT', 'SD', 'SG', 'misc'],
+}
+# Integer code for each RDKit bond type.
+bonds = {BT.SINGLE: 0, BT.DOUBLE: 1, BT.TRIPLE: 2, BT.AROMATIC: 3}
+# Each *_feature_dims value is a pair: (vocabulary size per categorical feature, number
+# of scalar features). The order of the ligand entries matches the column order
+# produced by lig_atom_featurizer.
+lig_feature_dims = (list(map(len, [
+    allowable_features['possible_atomic_num_list'],
+    allowable_features['possible_chirality_list'],
+    allowable_features['possible_degree_list'],
+    allowable_features['possible_formal_charge_list'],
+    allowable_features['possible_implicit_valence_list'],
+    allowable_features['possible_numH_list'],
+    allowable_features['possible_number_radical_e_list'],
+    allowable_features['possible_hybridization_list'],
+    allowable_features['possible_is_aromatic_list'],
+    allowable_features['possible_numring_list'],
+    allowable_features['possible_is_in_ring3_list'],
+    allowable_features['possible_is_in_ring4_list'],
+    allowable_features['possible_is_in_ring5_list'],
+    allowable_features['possible_is_in_ring6_list'],
+    allowable_features['possible_is_in_ring7_list'],
+    allowable_features['possible_is_in_ring8_list'],
+])), 0)  # number of scalar features
+# Receptor atom features: residue type, atomic number and two atom-type vocabularies.
+rec_atom_feature_dims = (list(map(len, [
+    allowable_features['possible_amino_acids'],
+    allowable_features['possible_atomic_num_list'],
+    allowable_features['possible_atom_type_2'],
+    allowable_features['possible_atom_type_3'],
+])), 0)
+# Receptor residue features: residue type only.
+rec_residue_feature_dims = (list(map(len, [
+    allowable_features['possible_amino_acids']
+])), 0)
+def lig_atom_featurizer(mol):
+    ringinfo = mol.GetRingInfo()
+    atom_features_list = []
+    for idx, atom in enumerate(mol.GetAtoms()):
+        chiral_tag = str(atom.GetChiralTag())
+        if chiral_tag  in ['CHI_SQUAREPLANAR', 'CHI_TRIGONALBIPYRAMIDAL', 'CHI_OCTAHEDRAL']:
+            chiral_tag = 'CHI_OTHER'
+        atom_features_list.append([
+            safe_index(allowable_features['possible_atomic_num_list'], atom.GetAtomicNum()),
+            allowable_features['possible_chirality_list'].index(str(chiral_tag)),
+            safe_index(allowable_features['possible_degree_list'], atom.GetTotalDegree()),
+            safe_index(allowable_features['possible_formal_charge_list'], atom.GetFormalCharge()),
+            safe_index(allowable_features['possible_implicit_valence_list'], atom.GetImplicitValence()),
+            safe_index(allowable_features['possible_numH_list'], atom.GetTotalNumHs()),
+            safe_index(allowable_features['possible_number_radical_e_list'], atom.GetNumRadicalElectrons()),
+            safe_index(allowable_features['possible_hybridization_list'], str(atom.GetHybridization())),
+            allowable_features['possible_is_aromatic_list'].index(atom.GetIsAromatic()),
+            safe_index(allowable_features['possible_numring_list'], ringinfo.NumAtomRings(idx)),
+            allowable_features['possible_is_in_ring3_list'].index(ringinfo.IsAtomInRingOfSize(idx, 3)),
+            allowable_features['possible_is_in_ring4_list'].index(ringinfo.IsAtomInRingOfSize(idx, 4)),
+            allowable_features['possible_is_in_ring5_list'].index(ringinfo.IsAtomInRingOfSize(idx, 5)),
+            allowable_features['possible_is_in_ring6_list'].index(ringinfo.IsAtomInRingOfSize(idx, 6)),
+            allowable_features['possible_is_in_ring7_list'].index(ringinfo.IsAtomInRingOfSize(idx, 7)),
+            allowable_features['possible_is_in_ring8_list'].index(ringinfo.IsAtomInRingOfSize(idx, 8)),
+            #g_charge if not np.isnan(g_charge) and not np.isinf(g_charge) else 0.
+        ])
+    return torch.tensor(atom_features_list)
+def safe_index(l, e):
+    """ Return index of element e in list l. If e is not present, return the last index """
+    try:
+        return l.index(e)
+    except:
+        return len(l) - 1
+def moad_extract_receptor_structure(path, complex_graph, neighbor_cutoff=20, max_neighbors=None, sequences_to_embeddings=None,
+                                    knn_only_graph=False, lm_embeddings=None, all_atoms=False, atom_cutoff=None, atom_max_neighbors=None):
+    # load the entire pdb file
+    pdb = pr.parsePDB(str(path))
+    seq = pdb.ca.getSequence()
+    coords = get_coords(pdb)
+    one_hot = get_onehot_sequence(seq)
+    chain_ids = np.zeros(len(one_hot))
+    res_chain_ids = pdb.ca.getChids()
+    res_seg_ids = pdb.ca.getSegnames()
+    res_chain_ids = np.asarray([s + c for s, c in zip(res_seg_ids, res_chain_ids)])
+    ids = np.unique(res_chain_ids)
+    sequences = []
+    lm_embeddings = lm_embeddings if sequences_to_embeddings is None else []
+    for i, id in enumerate(ids):
+        chain_ids[res_chain_ids == id] = i
+        s = np.argmax(one_hot[res_chain_ids == id], axis=1)
+        s = ''.join([aa_idx2aa_short[aa_idx] for aa_idx in s])
+        sequences.append(s)
+        if sequences_to_embeddings is not None:
+            lm_embeddings.append(sequences_to_embeddings[s])
+    complex_graph['receptor'].sequence = sequences
+    complex_graph['receptor'].chain_ids = torch.from_numpy(np.asarray(chain_ids)).long()
+    new_extract_receptor_structure(seq, coords, complex_graph, neighbor_cutoff=neighbor_cutoff, max_neighbors=max_neighbors,
+                                   lm_embeddings=lm_embeddings, knn_only_graph=knn_only_graph, all_atoms=all_atoms,
+                                   atom_cutoff=atom_cutoff, atom_max_neighbors=atom_max_neighbors)
+def new_extract_receptor_structure(seq, all_coords, complex_graph, neighbor_cutoff=20, max_neighbors=None, lm_embeddings=None,
+                                   knn_only_graph=False, all_atoms=False, atom_cutoff=None, atom_max_neighbors=None):
+    chi_angles, one_hot = get_chi_angles(all_coords, seq, return_onehot=True)
+    n_rel_pos, c_rel_pos = all_coords[:, 0, :] - all_coords[:, 1, :], all_coords[:, 2, :] - all_coords[:, 1, :]
+    side_chain_vecs = torch.from_numpy(np.concatenate([chi_angles / 360, n_rel_pos, c_rel_pos], axis=1))
+    # Build the k-NN graph
+    coords = torch.tensor(all_coords[:, 1, :], dtype=torch.float)
+    if len(coords) > 3000:
+        raise ValueError(f'The receptor is too large {len(coords)}')
+    if knn_only_graph:
+        edge_index = knn_graph(coords, k=max_neighbors if max_neighbors else 32)
+    else:
+        distances = cdist(coords, coords)
+        src_list = []
+        dst_list = []
+        for i in range(len(coords)):
+            dst = list(np.where(distances[i, :] < neighbor_cutoff)[0])
+            dst.remove(i)
+            max_neighbors = max_neighbors if max_neighbors else 1000
+            if max_neighbors != None and len(dst) > max_neighbors:
+                dst = list(np.argsort(distances[i, :]))[1: max_neighbors + 1]
+            if len(dst) == 0:
+                dst = list(np.argsort(distances[i, :]))[1:2]  # choose second because first is i itself
+                print(
+                    f'The cutoff {neighbor_cutoff} was too small for one atom such that it had no neighbors. '
+                    f'So we connected it to the closest other atom')
+            assert i not in dst
+            src = [i] * len(dst)
+            src_list.extend(src)
+            dst_list.extend(dst)
+        edge_index = torch.from_numpy(np.asarray([dst_list, src_list]))
+    res_names_list = [aa_short2long[seq[i]] if seq[i] in aa_short2long else 'misc' for i in range(len(seq))]
+    feature_list = [[safe_index(allowable_features['possible_amino_acids'], res)] for res in res_names_list]
+    node_feat = torch.tensor(feature_list, dtype=torch.float32)
+    lm_embeddings = torch.tensor(np.concatenate(lm_embeddings, axis=0)) if lm_embeddings is not None else None
+    complex_graph['receptor'].x = torch.cat([node_feat, lm_embeddings], axis=1) if lm_embeddings is not None else node_feat
+    complex_graph['receptor'].pos = coords
+    complex_graph['receptor'].side_chain_vecs = side_chain_vecs.float()
+    complex_graph['receptor', 'rec_contact', 'receptor'].edge_index = edge_index
+    if all_atoms:
+        atom_coords = all_coords.reshape(-1, 3)
+        atom_coords = torch.from_numpy(atom_coords[~np.any(np.isnan(atom_coords), axis=1)]).float()
+        if knn_only_graph:
+            atoms_edge_index = knn_graph(atom_coords, k=atom_max_neighbors if atom_max_neighbors else 1000)
+        else:
+            atoms_distances = cdist(atom_coords, atom_coords)
+            atom_src_list = []
+            atom_dst_list = []
+            for i in range(len(atom_coords)):
+                dst = list(np.where(atoms_distances[i, :] < atom_cutoff)[0])
+                dst.remove(i)
+                max_neighbors = atom_max_neighbors if atom_max_neighbors else 1000
+                if max_neighbors != None and len(dst) > max_neighbors:
+                    dst = list(np.argsort(atoms_distances[i, :]))[1: max_neighbors + 1]
+                if len(dst) == 0:
+                    dst = list(np.argsort(atoms_distances[i, :]))[1:2]  # choose second because first is i itself
+                    print(
+                        f'The atom_cutoff {atom_cutoff} was too small for one atom such that it had no neighbors. '
+                        f'So we connected it to the closest other atom')
+                assert i not in dst
+                src = [i] * len(dst)
+                atom_src_list.extend(src)
+                atom_dst_list.extend(dst)
+            atoms_edge_index = torch.from_numpy(np.asarray([atom_dst_list, atom_src_list]))
+        feats = [get_moad_atom_feats(res, all_coords[i]) for i, res in enumerate(seq)]
+        atom_feat = torch.from_numpy(np.concatenate(feats, axis=0)).float()
+        c_alpha_idx = np.concatenate([np.zeros(len(f)) + i for i, f in enumerate(feats)])
+        np_array = np.stack([np.arange(len(atom_feat)), c_alpha_idx])
+        atom_res_edge_index = torch.from_numpy(np_array).long()
+        complex_graph['atom'].x = atom_feat
+        complex_graph['atom'].pos = atom_coords
+        assert len(complex_graph['atom'].x) == len(complex_graph['atom'].pos)
+        complex_graph['atom', 'atom_contact', 'atom'].edge_index = atoms_edge_index
+        complex_graph['atom', 'atom_rec_contact', 'receptor'].edge_index = atom_res_edge_index
+    return
+def get_moad_atom_feats(res, coords):
+    feats = []
+    res_long = aa_short2long[res]
+    res_order = atom_order[res]
+    for i, c in enumerate(coords):
+        if np.any(np.isnan(c)):
+            continue
+        atom_feats = []
+        if res == '-':
+            atom_feats = [safe_index(allowable_features['possible_amino_acids'], 'misc'),
+                     safe_index(allowable_features['possible_atomic_num_list'], 'misc'),
+                     safe_index(allowable_features['possible_atom_type_2'], 'misc'),
+                     safe_index(allowable_features['possible_atom_type_3'], 'misc')]
+        else:
+            atom_feats.append(safe_index(allowable_features['possible_amino_acids'], res_long))
+            if i >= len(res_order):
+                atom_feats.extend([safe_index(allowable_features['possible_atomic_num_list'], 'misc'),
+                                   safe_index(allowable_features['possible_atom_type_2'], 'misc'),
+                                   safe_index(allowable_features['possible_atom_type_3'], 'misc')])
+            else:
+                atom_name = res_order[i]
+                try:
+                    atomic_num = periodic_table.GetAtomicNumber(atom_name[:1])
+                except:
+                    print("element", res_order[i][:1], 'not found')
+                    atomic_num = -1
+                atom_feats.extend([safe_index(allowable_features['possible_atomic_num_list'], atomic_num),
+                                   safe_index(allowable_features['possible_atom_type_2'], (atom_name + '*')[:2]),
+                                   safe_index(allowable_features['possible_atom_type_3'], atom_name)])
+        feats.append(atom_feats)
+    feats = np.asarray(feats)
+    return feats
+def get_lig_graph(mol, complex_graph):
+    atom_feats = lig_atom_featurizer(mol)
+    row, col, edge_type = [], [], []
+    for bond in mol.GetBonds():
+        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+        row += [start, end]
+        col += [end, start]
+        edge_type += 2 * [bonds[bond.GetBondType()]] if bond.GetBondType() != BT.UNSPECIFIED else [0, 0]
+    edge_index = torch.tensor([row, col], dtype=torch.long)
+    edge_type = torch.tensor(edge_type, dtype=torch.long)
+    edge_attr = F.one_hot(edge_type, num_classes=len(bonds)).to(torch.float)
+    complex_graph['ligand'].x = atom_feats
+    complex_graph['ligand', 'lig_bond', 'ligand'].edge_index = edge_index
+    complex_graph['ligand', 'lig_bond', 'ligand'].edge_attr = edge_attr
+    if mol.GetNumConformers() > 0:
+        lig_coords = torch.from_numpy(mol.GetConformer().GetPositions()).float()
+        complex_graph['ligand'].pos = lig_coords
+    return
+def generate_conformer(mol):
+    ps = AllChem.ETKDGv2()
+    failures, id = 0, -1
+    while failures < 3 and id == -1:
+        if failures > 0:
+            logger.debug(f'rdkit coords could not be generated. trying again {failures}.')
+        id = AllChem.EmbedMolecule(mol, ps)
+        failures += 1
+    if id == -1:
+        logger.info('rdkit coords could not be generated without using random coords. using random coords now.')
+        ps.useRandomCoords = True
+        AllChem.EmbedMolecule(mol, ps)
+        AllChem.MMFFOptimizeMolecule(mol, confId=0)
+        return True
+    #else:
+    #    AllChem.MMFFOptimizeMolecule(mol, confId=0)
+    return False
+def get_lig_graph_with_matching(mol_, complex_graph, popsize, maxiter, matching, keep_original, num_conformers, remove_hs, tries=10, skip_matching=False):
+    """Write the ligand graph to `complex_graph`, optionally from RDKit conformers matched to the input.
+
+    With `matching`, for each of `num_conformers` requested conformers the function
+    generates `tries` fresh RDKit conformers, optionally fits their rotatable bonds
+    to the input molecule with `optimize_rotatable_bonds` (skipped when
+    `skip_matching` is set or no rotatable bonds were found), aligns them with
+    `AlignMolConformers` and keeps the try with the lowest reported RMS value. The
+    first kept conformer defines the ligand graph via `get_lig_graph` and sets
+    `complex_graph.rmsd_matching`; later ones are appended to
+    `complex_graph['ligand'].pos`, which then becomes a list of tensors.
+
+    Without `matching`, the input molecule is used directly and `rmsd_matching` is 0.
+
+    In both cases `edge_mask` and `mask_rotate` from `get_transformation_mask` are
+    stored on `complex_graph['ligand']`.
+
+    Args:
+        mol_: input ligand molecule; it is deep-copied before being modified in the
+            matching branch.
+        complex_graph: graph object that is modified in place.
+        popsize, maxiter: forwarded to `optimize_rotatable_bonds`.
+        matching: whether to generate and match RDKit conformers.
+        keep_original: with `matching`, store the input conformer positions in
+            `complex_graph['ligand'].orig_pos`.
+        num_conformers: number of conformers to keep (matching branch only).
+        remove_hs: whether to strip hydrogens with `RemoveHs`.
+        tries: number of generated conformers per kept conformer.
+        skip_matching: skip the rotatable-bond optimization.
+    """
+    if matching:
+        mol_maybe_noh = copy.deepcopy(mol_)
+        if remove_hs:
+            mol_maybe_noh = RemoveHs(mol_maybe_noh, sanitize=True)
+            mol_maybe_noh = AllChem.RemoveAllHs(mol_maybe_noh)
+        if keep_original:
+            positions = []
+            for conf in mol_maybe_noh.GetConformers():
+                positions.append(conf.GetPositions())
+            # A single conformer is stored as one array, several as a stacked array.
+            complex_graph['ligand'].orig_pos = np.asarray(positions) if len(positions) > 1 else positions[0]
+        # rotatable_bonds = get_torsion_angles(mol_maybe_noh)
+        # Torsions are computed on a copy that always has all hydrogens removed,
+        # regardless of `remove_hs`.
+        _tmp = copy.deepcopy(mol_)
+        if remove_hs:
+            _tmp = RemoveHs(_tmp, sanitize=True)
+        _tmp = AllChem.RemoveAllHs(_tmp)
+        rotatable_bonds = get_torsion_angles(_tmp)
+        for i in range(num_conformers):
+            mols, rmsds = [], []
+            for _ in range(tries):
+                # Generate a fresh conformer on a hydrogen-complete copy, then strip hydrogens.
+                mol_rdkit = copy.deepcopy(mol_)
+                mol_rdkit.RemoveAllConformers()
+                mol_rdkit = AllChem.AddHs(mol_rdkit)
+                generate_conformer(mol_rdkit)
+                if remove_hs:
+                    mol_rdkit = RemoveHs(mol_rdkit, sanitize=True)
+                mol_rdkit = AllChem.RemoveAllHs(mol_rdkit)
+                mol = AllChem.RemoveAllHs(copy.deepcopy(mol_maybe_noh))
+                if rotatable_bonds and not skip_matching:
+                    optimize_rotatable_bonds(mol_rdkit, mol, rotatable_bonds, popsize=popsize, maxiter=maxiter)
+                # Put the generated conformer next to the input one so both can be aligned.
+                mol.AddConformer(mol_rdkit.GetConformer())
+                rms_list = []
+                AllChem.AlignMolConformers(mol, RMSlist=rms_list)
+                mol_rdkit.RemoveAllConformers()
+                # Index 1 is the conformer added above -- assumes the input molecule
+                # carries exactly one conformer; TODO confirm.
+                mol_rdkit.AddConformer(mol.GetConformers()[1])
+                mols.append(mol_rdkit)
+                rmsds.append(rms_list[0])
+            # select molecule with lowest rmsd
+            #print("mean std min max", np.mean(rmsds), np.std(rmsds), np.min(rmsds), np.max(rmsds))
+            mol_rdkit = mols[np.argmin(rmsds)]
+            if i == 0:
+                complex_graph.rmsd_matching = min(rmsds)
+                get_lig_graph(mol_rdkit, complex_graph)
+            else:
+                # From the second conformer on, `pos` is a list of position tensors.
+                if torch.is_tensor(complex_graph['ligand'].pos):
+                    complex_graph['ligand'].pos = [complex_graph['ligand'].pos]
+                complex_graph['ligand'].pos.append(torch.from_numpy(mol_rdkit.GetConformer().GetPositions()).float())
+    else:  # no matching
+        complex_graph.rmsd_matching = 0
+        if remove_hs: mol_ = RemoveHs(mol_)
+        get_lig_graph(mol_, complex_graph)
+    edge_mask, mask_rotate = get_transformation_mask(complex_graph)
+    complex_graph['ligand'].edge_mask = torch.tensor(edge_mask)
+    complex_graph['ligand'].mask_rotate = mask_rotate
+    return
+def get_rec_misc_atom_feat(bio_atom=None, atom_name=None, element=None, get_misc_features=False):
+    if get_misc_features:
+        return [safe_index(allowable_features['possible_amino_acids'], 'misc'),
+                 safe_index(allowable_features['possible_atomic_num_list'], 'misc'),
+                 safe_index(allowable_features['possible_atom_type_2'], 'misc'),
+                 safe_index(allowable_features['possible_atom_type_3'], 'misc')]
+    if atom_name is not None:
+        atom_name = atom_name
+    else:
+        atom_name = bio_atom.name
+    if element is not None:
+        element = element
+    else:
+        element = bio_atom.element
+    if element == 'CD':
+        element = 'C'
+    assert not element == ''
+    try:
+        atomic_num = periodic_table.GetAtomicNumber(element.lower().capitalize())
+    except:
+        atomic_num = -1
+    atom_feat = [safe_index(allowable_features['possible_amino_acids'], bio_atom.get_parent().get_resname()),
+                 safe_index(allowable_features['possible_atomic_num_list'], atomic_num),
+                 safe_index(allowable_features['possible_atom_type_2'], (atom_name + '*')[:2]),
+                 safe_index(allowable_features['possible_atom_type_3'], atom_name)]
+    return atom_feat
+def write_mol_with_coords(mol, new_coords, path):
+    w = Chem.SDWriter(path)
+    conf = mol.GetConformer()
+    for i in range(mol.GetNumAtoms()):
+        x,y,z = new_coords.astype(np.double)[i]
+        conf.SetAtomPosition(i,Point3D(x,y,z))
+    w.write(mol)
+    w.close()
+def create_mol_with_coords(mol, new_coords, path=None):
+    conf = mol.GetConformer()
+    for i in range(mol.GetNumAtoms()):
+        x, y, z = new_coords[i]
+        conf.SetAtomPosition(i, Point3D(float(x), float(y), float(z)))
+    if path:
+        w = Chem.SDWriter(path)
+        w.write(mol)
+        w.close()
+    return mol
+def read_molecule(ligand_description, sanitize=False, calc_charges=False, remove_hs=False, remove_confs=False):
+    mol = None
+    # Check if ligand_description is a path to a file
+    if Path(ligand_description).is_absolute() or len(Path(ligand_description).parts) > 1:
+        path = Path(ligand_description)
+        if path.is_file():
+            match path.suffix:
+                case '.mol':
+                    mol =  Chem.MolFromMolFile(str(path), sanitize=False, removeHs=True)
+                case '.mol2':
+                    mol = Chem.MolFromMol2File(str(path), sanitize=False, removeHs=False)
+                case '.sdf':
+                    supplier = Chem.SDMolSupplier(str(path), sanitize=False, removeHs=False)
+                    mol = supplier[0]
+                case '.pdbqt':
+                    with open(path) as file:
+                        pdbqt_data = file.readlines()
+                    pdb_block = ''
+                    for line in pdbqt_data:
+                        pdb_block += '{}\n'.format(line[:66])
+                    mol = Chem.MolFromPDBBlock(pdb_block, sanitize=False, removeHs=False)
+                case '.pdb':
+                    mol = Chem.MolFromPDBFile(str(path), sanitize=False, removeHs=False)
+                case _:
+                    logger.warning(f'Expect the format of the molecule file to be '
+                                     f'one of .mol2, .sdf, .pdbqt and .pdb, got {ligand_description}')
+        else:
+            raise FileNotFoundError(f'File {ligand_description} not found.')
+    else:
+        mol = Chem.MolFromSmiles(ligand_description)
+        # No need to remove conformers if the molecule is not read from a file
+        remove_confs = False
+    if mol is not None:
+        try:
+            if sanitize or calc_charges:
+                Chem.SanitizeMol(mol)
+            if calc_charges:
+                # Compute Gasteiger charges on the molecule.
+                try:
+                    AllChem.ComputeGasteigerCharges(mol)
+                except:
+                    warnings.warn('Unable to compute charges for the molecule.')
+            if remove_hs:
+                mol = Chem.RemoveHs(mol, sanitize=sanitize)
+            if remove_confs:
+                mol.RemoveAllConformers()
+        except Exception as e:
+            # Print stacktrace
+            import traceback
+            msg = traceback.format_exc()
+            logger.warning(f"Failed to process molecule: {ligand_description}\n{msg}")
+            return None
+    return mol

datasets/sidechain_esm_embeddings_to_pt.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os
+import pickle
+from argparse import ArgumentParser
+import torch
+from tqdm import tqdm
+parser = ArgumentParser()
+parser.add_argument('--esm_embeddings_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new', help='')
+parser.add_argument('--output_path', type=str, default='data/BindingMOAD_2020_ab_processed_biounit/moad_sequences_new.pt', help='')
+args = parser.parse_args()
+dic = {}
+# read text file with all sequences
+with open('data/pdb_2021aug02/sequences_to_id.fasta') as f:
+    lines = f.readlines()
+# read sequences
+with open('data/pdb_2021aug02/useful_sequences.pkl', 'rb') as f:
+    sequences = pickle.load(f)
+ids = set()
+dict_seq_id = {seq[:-1]: str(id) for id, seq in enumerate(lines)}
+for i, seq in tqdm(enumerate(sequences)):
+    ids.add(dict_seq_id[seq])
+    if i == 20000: break
+print("total", len(ids), "out of", len(os.listdir(args.esm_embeddings_path)))
+available = set([filename.split('.')[0] for filename in os.listdir(args.esm_embeddings_path)])
+final = available.intersection(ids)
+for idp in tqdm(final):
+    dic[idp] = torch.load(os.path.join(args.esm_embeddings_path, idp+'.pt'))['representations'][33]
+torch.save(dic,args.output_path)

inference.py CHANGED Viewed

@@ -45,7 +45,6 @@ from utils.diffusion_utils import t_to_sigma as t_to_sigma_compl, get_t_schedule
 from utils.inference_utils import InferenceDataset
 from utils.sampling import randomize_position, sampling
 from utils.utils import get_model
-from utils.visualise import PDBFile
 from tqdm import tqdm
 configure_logger()
@@ -622,7 +621,7 @@ def extract_pockets(protein_path, ligand_residue=None, top_pockets=None):
     # Run fpocket
     distance = 2.5
     min_size = 30
-    args = ['fpocket', '-d',  '-f', tmp_protein_path, '-D', str(distance), '-i', str(min_size)]
     if ligand_residue is not None:
         args += ['-r', ligand_residue]
     print(args)

 from utils.inference_utils import InferenceDataset
 from utils.sampling import randomize_position, sampling
 from utils.utils import get_model
 from tqdm import tqdm
 configure_logger()
     # Run fpocket
     distance = 2.5
     min_size = 30
+    args = ['./fpocket', '-d',  '-f', tmp_protein_path, '-D', str(distance), '-i', str(min_size)]
     if ligand_residue is not None:
         args += ['-r', ligand_residue]
     print(args)

requirements.txt CHANGED Viewed

	@@ -0,0 +1,30 @@

+# deep learning
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.4
+lightning==2.4
+torch-geometric
+https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_cluster-1.6.3%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
+https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_scatter-2.1.2%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
+https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_sparse-0.6.18%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
+https://data.pyg.org/whl/torch-2.4.0%2Bcu118/torch_spline_conv-1.2.2%2Bpt24cu118-cp311-cp311-linux_x86_64.whl
+scipy
+fair-esm
+e3nn
+scikit-learn
+# configuration and utilities
+hydra-core
+pandas
+pandarallel
+panel
+seaborn
+apscheduler
+tinydb
+email-validator
+gradio_rangeslider
+# bio/cheminformatics
+biopython
+rdkit
+openbabel-wheel
+prody

resources/animations/example_3.gif DELETED Viewed

Binary file (493 kB)

resources/animations/example_6.gif DELETED Viewed

Binary file (750 kB)

resources/linker_size_distributions.png DELETED Viewed

Binary file (16.3 kB)

resources/moad_test_pdbs.txt DELETED Viewed

@@ -1,90 +0,0 @@
-1j78
-6c0b
-1j6z
-2uyi
-6cfg
-5ncg
-1esv
-3spf
-5yee
-1bxm
-3zln
-4b1w
-2gm1
-1yrs
-3ukr
-5xdv
-1efi
-1fd7
-2x2r
-2a5x
-1lri
-1rvx
-3ken
-6rvp
-5ncp
-3pa8
-3eks
-1nwk
-5n69
-3f7i
-4b1x
-7aki
-5neg
-2fl6
-2wog
-5ncf
-3sjh
-6qgk
-3zjx
-3gt9
-2x7c
-2uym
-2fky
-4dxd
-3zcw
-1dtl
-2q2y
-6vmz
-2vl8
-4b1v
-4b1y
-1sqk
-1eei
-2pg2
-3cjo
-3gta
-6pb9
-5zzb
-4pa0
-6hmy
-2i3i
-6rcj
-2fme
-4b1u
-2x7d
-3lkf
-2fl2
-2x7e
-6kvp
-2q0u
-3f7h
-6g6y
-3mn5
-1rdw
-1mfi
-5xdt
-6hmw
-1q0b
-5ndu
-6rcf
-6p7r
-5lzj
-1rvt
-1oke
-3k3b
-1lt6
-5xdu
-3l9h
-1rv0
-3eku

resources/moad_val_pdbs.txt DELETED Viewed

@@ -1,153 +0,0 @@
-2e27
-2ajy
-2bmk
-6npi
-1yec
-6lam
-25c8
-1xuo
-2ykl
-3f78
-2ajx
-6vh6
-4mrh
-2r2e
-3fo1
-4np2
-3gmm
-6lt6
-1mh5
-6npp
-6v3j
-2ajz
-1d6v
-1x9q
-1ngp
-1i7z
-2hxw
-2ica
-1gaf
-4mre
-1xdg
-1dl7
-3vrj
-2yip
-2bjm
-3vkx
-6be3
-2o5z
-3m6f
-2z92
-4l4v
-2hvk
-1wc7
-1c1e
-2c1p
-1usq
-1aj7
-6ooy
-1y0l
-1riv
-6pvc
-2yk1
-1yee
-4mrf
-2ddq
-4lcw
-1flr
-3t0w
-3fo2
-1qyg
-1vpo
-2ajv
-1ub5
-4k3h
-3cfb
-3t0x
-2omn
-5xqw
-5ln4
-1mrf
-6npm
-1t66
-1rum
-5u98
-5zqk
-2pye
-1eap
-1yei
-1rua
-1riu
-1xdd
-1oar
-5j6h
-6dzn
-1jgl
-1h8s
-1lo0
-1f3d
-3fjg
-1q0y
-1kn2
-2bfv
-6pvd
-3kdm
-1f4x
-1b09
-3if1
-1mjj
-6itp
-35c8
-2jkl
-1a6v
-3vri
-2jkj
-3fo0
-1a4k
-1lo2
-43ca
-1cfv
-1a0q
-6x42
-1rd4
-2dwe
-4np3
-3upr
-1yeg
-1kn4
-6lah
-1um5
-1mex
-6itq
-3bqm
-4f8l
-1yek
-1mj7
-1rul
-1um4
-1fl3
-4mrg
-4ia6
-2cgr
-2cju
-1ghq
-1c5c
-6lb2
-1jgu
-1fe8
-7jra
-4f8n
-6nux
-1ynk
-1bfv
-2pcp
-2z93
-1wz1
-2yio
-1yef
-3cfd
-1lo3
-1q72
-2o7n
-3fj7
-3oaz

resources/wehi_pains.csv DELETED Viewed

@@ -1,480 +0,0 @@
-"c:1:c:c(:c:c:c:1-[#6;X4]-c:2:c:c:c(:c:c:2)-[#7&H2,$([#7;!H0]-[#6;X4]),$([#7](-[#6X4])-[#6X4])])-[#7&H2,$([#7;!H0]-[#6;X4]),$([#7](-[#6X4])-[#6X4])]","<regId=anil_di_alk_F(14)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#1])-[#1])-[#1])-[#6]=[#7]-[#7]-[#1]","<regId=hzone_anil(14)>"
-"c1(nn(c([c;!H0,$(c-[#6;!H0])]1)-[#8]-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#1])-[#1])-[#1])-[#6;X4]","<regId=het_5_pyrazole_OH(14)>"
-"c:2(:c:1-[#16]-c:3:c(-[#7;!H0,$([#7]-[CH3]),$([#7]-[#6;!H0;!H1]-[#6;!H0])](-c:1:c(:c(:c:2-[#1])-[#1])-[#1])):[c;!H0,$(c~[#7](-[#1])-[#6;X4]),$(c~[#6]:[#6])](:[c;!H0,$(c~[#6]:[#6])]:[c;!H0,$(c-[#7](-[#1])-[#1]),$(c-[#8]-[#6;X4])]:c:3-[#1]))-[#1]","<regId=het_thio_666_A(13)>"
-"[#6]-2-[#6]-c:1:c(:c:c:c:c:1)-[#6](-c:3:c:c:c:c:c-2:3)=[#6]-[#6]","<regId=styrene_A(13)>"
-"[#16]-1-[#6](=[#7]-[#6]:[#6])-[#7;!H0,$([#7]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#8]),$([#7]-[#6]:[#6])]-[#6](=[#8])-[#6]-1=[#6](-[#1])-[$([#6]:[#6]:[#6]-[#17]),$([#6]:[!#6&!#1])]","<regId=ene_rhod_C(13)>"
-"[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6](-[#6]=[#6])-[#8]-1)-[#6](-[#1])-[#1]","<regId=dhp_amino_CN_A(13)>"
-"[#8]=[#16](=[#8])-[#6](-[#6]#[#7])=[#7]-[#7]-[#1]","<regId=cyano_imine_C(12)>"
-"c:1:c:c:c:c:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2","<regId=thio_urea_A(12)>"
-"c:1:c(:c:c:c:c:1)-[#7](-[#1])-c:2:c(:c(:c(:s:2)-[$([#6]=[#8]),$([#6]#[#7]),$([#6](-[#8]-[#1])=[#6])])-[#7])-[$([#6]#[#7]),$([#6](:[#7]):[#7])]","<regId=thiophene_amino_B(12)>"
-"[#6;X4]-1-[#6](=[#8])-[#7]-[#7]-[#6]-1=[#8]","<regId=keto_keto_beta_B(12)>"
-"c:1:c-3:c(:c:c:c:1)-[#6]:2:[#7]:[!#1]:[#6]:[#6]:[#6]:2-[#6]-3=[#8]","<regId=keto_phenone_A(11)>"
-"[#6]-1(-[#6](=[#6](-[#6]#[#7])-[#6](~[#8])~[#7]~[#6]-1~[#8])-[#6](-[#1])-[#1])=[#6](-[#1])-[#6]:[#6]","<regId=cyano_pyridone_C(11)>"
-"[#6]-1(=[#6](-!@[#6]=[#7])-[#16]-[#6](-[#7]-1)=[#8])-[$([F,Cl,Br,I]),$([#7+](:[#6]):[#6])]","<regId=thiaz_ene_C(11)>"
-"c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1]):[!#6&!#1]:[#6;!H0,$([#6]-[OH]),$([#6]-[#6;H2,H3])](:[#6]:2-[#6](-[#1])=[#7]-[#7](-[#1])-[$([#6]:1:[#7]:[#6]:[#6](-[#1]):[#16]:1),$([#6]:[#6](-[#1]):[#6]-[#1]),$([#6]:[#7]:[#6]:[#7]:[#6]:[#7]),$([#6]:[#7]:[#7]:[#7]:[#7])])","<regId=hzone_thiophene_A(11)>"
-"[!#1]:[!#1]-[#6;!H0,$([#6]-[#6]#[#7])]=[#6]-1-[#6]=,:[#6]-[#6](=[$([#8]),$([#7;!R])])-[#6]=,:[#6]-1","<regId=ene_quin_methide(10)>"
-"c:1:c:c-2:c(:c:c:1)-[#6]-[#6](-c:3:c(-[#16]-2):c(:c(-[#1]):[c;!H0,$(c-[#8]),$(c-[#16;X2]),$(c-[#6;X4]),$(c-[#7;H2,H3,$([#7!H0]-[#6;X4]),$([#7](-[#6;X4])-[#6;X4])])](:c:3-[#1]))-[#1])-[#7;H2,H3,$([#7;!H0]-[#6;X4]),$([#7](-[#6;X4])-[#6;X4])]","<regId=het_thio_676_A(10)>"
-"[#6]-1(=[#8])-[#6](=[#6](-[#1])-[$([#6]:1:[#6]:[#6]:[#6]:[#6]:[#6]:1),$([#6]:1:[#6]:[#6]:[#6]:[!#6&!#1]:1)])-[#7]=[#6](-[!#1]:[!#1]:[!#1])-[$([#16]),$([#7]-[!#1]:[!#1])]-1","<regId=ene_five_het_G(10)>"
-"[#7+](:[!#1]:[!#1]:[!#1])-[!#1]=[#8]","<regId=acyl_het_A(9)>"
-"[#6;X4]-[#7](-[#6;X4])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6]2=,:[#7][#6]:[#6]:[!#1]2)-[#1])-[#1]","<regId=anil_di_alk_G(9)>"
-"[#7;!H0,$([#7]-[#6;X4])]-1-[#6]=,:[#6](-[#6](=[#8])-[#6]:[#6]:[#6])-[#6](-[#6])-[#6](=[#6]-1-[#6](-[#1])(-[#1])-[#1])-[$([#6]=[#8]),$([#6]#[#7])]","<regId=dhp_keto_A(9)>"
-"c:1:c:c:c:c:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2","<regId=thio_urea_B(9)>"
-"c:1:3:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c:c:c:2)-[#1]):n:c(-[#1]):n:3-[#6]","<regId=anil_alk_bim(9)>"
-"c:1:c:c-2:c(:c:c:1)-[#7]=[#6]-[#6]-2=[#7;!R]","<regId=imine_imine_A(9)>"
-"c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-[#6](=[#8])-[#6]-,:2:[!#1]:[!#6&!#1]:[#6]:[#6]-,:2","<regId=thio_urea_C(9)>"
-"[#7;!R]=[#6]-2-[#6](=[#8])-c:1:c:c:c:c:c:1-[#16]-2","<regId=imine_one_fives_B(9)>"
-"[$([#7](-[#1])-[#1]),$([#8]-[#1])]-[#6]-2=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-c:1:c(:n(-[#6]):n:c:1)-[#8]-2","<regId=dhp_amino_CN_B(9)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:n:c:1-[#1])-[#8]-c:2:c:c:c:c:c:2)-[#1])-[#1]","<regId=anil_OC_no_alk_A(8)>"
-"[#6](=[#8])-[#6]-1=[#6]-[#7]-c:2:c(-[#16]-1):c:c:c:c:2","<regId=het_thio_66_one(8)>"
-"c:1:c:c-2:c(:c:c:1)-[#6](-c:3:c(-[$([#16;X2]),$([#6;X4])]-2):c:c:[c;!H0,$(c-[#17]),$(c-[#6;X4])](:c:3))=[#6]-[#6]","<regId=styrene_B(8)>"
-"[#6](-[#1])(-[#1])-[#16;X2]-c:1:n:c(:c(:n:1-!@[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2)-[#1]","<regId=het_thio_5_A(8)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6]-2=[#6](-[#1])-c:1:c(:c:c:c:c:1)-[#16;X2]-c:3:c-2:c:c:c:c:3","<regId=anil_di_alk_ene_A(8)>"
-"[#16]-1-[#6](=!@[#7;!H0,$([#7]-[#7](-[#1])-[#6]:[#6])])-[#7;!H0,$([#7]-[#6]:[#7]:[#6]:[#6]:[#16])]-[#6](=[#8])-[#6]-1=[#6](-[#1])-[#6]:[#6]-[$([#17]),$([#8]-[#6]-[#1])]","<regId=ene_rhod_D(8)>"
-"[#16]-1-[#6](=[#8])-[#7]-[#6](=[#16])-[#6]-1=[#6](-[#1])-[#6]:[#6]","<regId=ene_rhod_E(8)>"
-"c:1:c(:c:c:c:c:1)-[#6](-[#1])(-[#1])-[#7](-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#1])-[#1])-[#1]","<regId=anil_OH_alk_A(8)>"
-"n1(-[#6;X4])c(c(-[#1])c(c1-[#6]:[#6])-[#1])-[#6](-[#1])-[#1]","<regId=pyrrole_C(8)>"
-"c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-c:2:c:c:c:c:c:2","<regId=thio_urea_D(8)>"
-"[#7](-c:1:c:c:c:c:c:1)-c2[n+]c(cs2)-c:3:c:c:c:c:c:3","<regId=thiaz_ene_D(8)>"
-"n:1:c:c:c(:c:1-[#6](-[#1])-[#1])-[#6](-[#1])=[#6]-2-[#6](=[#8])-[#7]-[#6](=[!#6&!#1])-[#7]-2","<regId=ene_rhod_F(8)>"
-"[#6]-,:1(=,:[#6](-[#6](-[#1])(-[#6])-[#6])-,:[#16]-,:[#6](-,:[#7;!H0,$([#7]-[#6;!H0;!H1])]-,:1)=[#8])-[#16]-[#6;R]","<regId=thiaz_ene_E(8)>"
-"[!#1]:,-1:[!#1]-,:2:[!#1](:[!#1]:[!#1]:[!#1]:,-1)-,:[#7](-[#1])-,:[#7](-,:[#6]-,:2=[#8])-[#6]","<regId=het_65_B(7)>"
-"c:1:c:c-2:c(:c:c:1)-[#6](=[#6](-[#6]-2=[#8])-[#6])-[#8]-[#1]","<regId=keto_keto_beta_C(7)>"
-"c:2:c:c:1:n:n:c(:n:c:1:c:c:2)-[#6](-[#1])(-[#1])-[#6]=[#8]","<regId=het_66_A(7)>"
-"c:1:c:c:c:c:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:n:c:c:c:c:2","<regId=thio_urea_E(7)>"
-"[#6](-[#1])-[#6](-[#1])(-[#1])-c:1:c(:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-[#6]-[#6]-[#6]=[#8])-[$([#6](=[#8])-[#8]),$([#6]#[#7])])-[#6](-[#1])-[#1]","<regId=thiophene_amino_C(7)>"
-"[#6](-c:1:c(:c(:[c;!H0,$(c-[#6;X4])]:c:c:1-[#1])-[#1])-[#1])(-c:2:c(:c(:[c;!H0,$(c-[#17])](:c(:c:2-[#1])-[#1]))-[#1])-[#1])=[$([#7]-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]),$([#7]-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]),$([#7]-[#7](-[#1])-[#6](=[#7]-[#1])-[#7](-[#1])-[#1]),$([#6](-[#1])-[#7])]","<regId=hzone_phenone(7)>"
-"[#8](-[#1])-[#6](=[#8])-c:1:c:c(:c:c:c:1)-[#6]:[!#1]:[#6]-[#6](-[#1])=[#6]-2-[#6](=[!#6&!#1])-[#7]-[#6](=[!#6&!#1])-[!#6&!#1]-2","<regId=ene_rhod_G(7)>"
-"[#6]-1(=[#6]-[#6](-c:2:c:c(:c(:n:c-1:2)-[#7](-[#1])-[#1])-[#6]#[#7])=[#6])-[#6]#[#7]","<regId=ene_cyano_B(7)>"
-"[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6](-[#6]:[#6])-[#8]-1)-[#6]#[#7]","<regId=dhp_amino_CN_C(7)>"
-"[#7]-2(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#6]=[#8])-[#6;X4]-[#6]-2=[#8]","<regId=het_5_A(7)>"
-"[#7]-1=[#6]-[#6](-[#6](-[#7]-1)=[#16])=[#6]","<regId=ene_five_het_H(6)>"
-"c1(coc(c1-[#1])-[#6](=[#16])-[#7]-2-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[!#1]-[#6](-[#1])(-[#1])-[#6]-2(-[#1])-[#1])-[#1]","<regId=thio_amide_A(6)>"
-"[#6]=[#6](-[#6]#[#7])-[#6](=[#7]-[#1])-[#7]-[#7]","<regId=ene_cyano_C(6)>"
-"c:1(:c(:c(:[c;!H0,$(c-[#6;!H0;!H1])](:o:1))-[#1])-[#1])-[#6;!H0,$([#6]-[#6;!H0;!H1])]=[#7]-[#7](-[#1])-c:2:n:c:c:s:2","<regId=hzone_furan_A(6)>"
-"c:1(:c(:c(:c(:c(:c:1-[#7](-[#1])-[#16](=[#8])(=[#8])-[#6]:2:[#6]:[!#1]:[#6]:[#6]:[#6]:2)-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_H(6)>"
-"n2c1ccccn1c(c2-[$([#6](-[!#1])=[#6](-[#1])-[#6]:[#6]),$([#6]:[#8]:[#6])])-[#7]-[#6]:[#6]","<regId=het_65_C(6)>"
-"[#6]-1-[#7](-[#1])-[#7](-[#1])-[#6](=[#16])-[#7]-[#7]-1-[#1]","<regId=thio_urea_F(6)>"
-"c:1(:c:c:c:o:1)-[#6](-[#1])=!@[#6]-3-[#6](=[#8])-c:2:c:c:c:c:c:2-[!#6&!#1]-3","<regId=ene_five_het_I(6)>"
-"[#8]=[#6]-1-[#6;X4]-[#6]-[#6](=[#8])-c:2:c:c:c:c:c-1:2","<regId=keto_keto_gamma(5)>"
-"c:1:c:c-2:c(:c:c:1)-[#6](-c3cccc4noc-2c34)=[#8]","<regId=quinone_B(5)>"
-"[#8](-[#1])-c:1:n:c(:c:c:c:1)-[#8]-[#1]","<regId=het_6_pyridone_OH(5)>"
-"c:1:2:c(:c(:c(:c(:c:1:c(:c(:c(:c:2-[#1])-[#1])-[#6]=[#7]-[#7](-[#1])-[$([#6]:[#6]),$([#6]=[#16])])-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=hzone_naphth_A(5)>"
-"[#6]-,:1=,:[#6](-,:[#16]-,:[#6](-,:[#6]=,:[#6]-,:1)=[#16])-,:[#7]","<regId=thio_ester_A(5)>"
-"[#6]-1=[#6]-[#6](-[#8]-[#6]-1-[#8])(-[#8])-[#6]","<regId=ene_misc_A(5)>"
-"[#8]=[#6]-,:1-,:[#6](=,:[#6]-,:[#6](=,:[#7]-,:[#7]-,:1)-,:[#6]=[#8])-[#6]#[#7]","<regId=cyano_pyridone_D(5)>"
-"c3cn1c(nc(c1-[#7]-[#6])-c:2:c:c:c:c:n:2)cc3","<regId=het_65_Db(5)>"
-"[#7]-2-c:1:c:c:c:c:c:1-[#6](=[#7])-c:3:c-2:c:c:c:c:3","<regId=het_666_A(5)>"
-"c:1:c(:c:c:c:c:1)-[#7]-2-[#6](-[#1])-[#6](-[#1])-[#7](-[#6](-[#1])-[#6]-2-[#1])-[#16](=[#8])(=[#8])-c:3:c:c:c:c:4:n:s:n:c:3:4","<regId=diazox_sulfon_B(5)>"
-"c:1(:c(:c-,:2:c(:c(:c:1-[#1])-[#1])-,:[#7](-,:[#6](-,:[#7]-,:2-[#1])=[#8])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])-[#1]","<regId=anil_NH_alk_A(5)>"
-"c:1(:c(:c-3:c(:c(:c:1-[#7](-[#1])-[#16](=[#8])(=[#8])-c:2:c:c:c(:c:c:2)-[!#6&!#1])-[#1])-[#8]-[#6](-[#8]-3)(-[#1])-[#1])-[#1])-[#1]","<regId=sulfonamide_C(5)>"
-"[#6](-[#1])-[#6]:2:[#7]:[#7](-c:1:c:c:c:c:c:1):[#16]:3:[!#6&!#1]:[!#1]:[#6]:[#6]:2:3","<regId=het_thio_N_55(5)>"
-"[#8]=[#6]-[#6]=[#6](-[#1])-[#8]-[#1]","<regId=keto_keto_beta_D(5)>"
-"[#7]-,:1-,:2-,:[#6](=,:[#7]-,:[#6](=[#8])-,:[#6](=,:[#7]-,:1)-[#6](-[#1])-[#1])-,:[#16]-,:[#6](=[#6](-[#1])-[#6]:[#6])-,:[#6]-,:2=[#8]","<regId=ene_rhod_H(5)>"
-"[#6]:[#6]-[#6](-[#1])=[#6](-[#1])-[#6](-[#1])=[#7]-[#7](-[#6;X4])-[#6;X4]","<regId=imine_ene_A(5)>"
-"c:1:3:c(:c:c:c:c:1):c:2:n:n:c(-[#16]-[#6](-[#1])(-[#1])-[#6]=[#8]):n:c:2:n:3-[#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1]","<regId=het_thio_656a(5)>"
-"n1(-[#6])c(c(-[#1])c(c1-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](=[#16])-[#7]-[#1])-[#1])-[#1]","<regId=pyrrole_D(5)>"
-"n2(-[#6]:1:[!#1]:[!#6&!#1]:[!#1]:[#6]:1-[#1])c(c(-[#1])c(c2-[#6;X4])-[#1])-[#6;X4]","<regId=pyrrole_E(5)>"
-"c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-[#6]([#7;R])[#7;R]","<regId=thio_urea_G(5)>"
-"c:1(:c(:c(:c(:c(:[c;!H0,$(c-[#6](-[#1])-[#1])]:1)-[#1])-[#8]-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[$([#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]),$([#6](-[#1])(-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](=[#16])-[#7]-[#1])])-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=anisol_A(5)>"
-"n2(-[#6]:1:[#6](-[#6]#[#7]):[#6]:[#6]:[!#6&!#1]:1)c(c(-[#1])c(c2)-[#1])-[#1]","<regId=pyrrole_F(5)>"
-"[#7](-[#1])(-[#1])-[#6]-2=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-c:1:c(:c:c:s:1)-[#8]-2","<regId=dhp_amino_CN_D(5)>"
-"[#7](-[#1])-c:1:n:c(:c:s:1)-c:2:c:n:c(-[#7](-[#1])-[#1]):s:2","<regId=thiazole_amine_A(4)>"
-"[#7]=[#6]-1-[#7](-[#1])-[#6](=[#6](-[#7]-[#1])-[#7]=[#7]-1)-[#7]-[#1]","<regId=het_6_imidate_A(4)>"
-"c:1:c(:c:2:c(:c:c:1):c:c:c:c:2)-[#8]-c:3:c(:c(:c(:c(:c:3-[#1])-[#1])-[#7]-[#1])-[#1])-[#1]","<regId=anil_OC_no_alk_B(4)>"
-"c:1:c:c-2:c(:c:c:1)-[#6]-[#16]-c3c(-[#6]-2=[#6])ccs3","<regId=styrene_C(4)>"
-"c:2:c:c:c:1:c(:c:c:c:1):c:c:2","<regId=azulene(4)>"
-"c:1(:c(:c(:c(:o:1)-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#8]-[#6]:[#6])-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=furan_acid_A(4)>"
-"[!#1]:[#6]-[#6]-,:1=,:[#6](-[#1])-,:[#6](=,:[#6](-[#6]#[#7])-,:[#6](=[#8])-,:[#7]-,:1-[#1])-[#6]:[#8]","<regId=cyano_pyridone_E(4)>"
-"[#6]-1-3=[#6](-[#6](-[#7]-c:2:c:c:c:c:c-1:2)(-[#6])-[#6])-[#16]-[#16]-[#6]-3=[!#1]","<regId=anil_alk_thio(4)>"
-"c:1(:c(:c(:c(:c(:c:1-[#7](-[#1])-[#6](=[#8])-c:2:c:c:c:c:c:2)-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_I(4)>"
-"[#6](-[#1])(-[#1])-[#16;X2]-c:1:n:n:c(:c(:n:1)-c:2:c(:c(:c(:o:2)-[#1])-[#1])-[#1])-c:3:c(:c(:c(:o:3)-[#1])-[#1])-[#1]","<regId=het_thio_6_furan(4)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6]-2=[#6]-c:1:c(:c:c:c:c:1)-[#6]-2(-[#1])-[#1]","<regId=anil_di_alk_ene_B(4)>"
-"[#7](-[#1])(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#6](=[#8])-[#6](-[#1])-[#1])-[#7](-[#1])-[$([#7]-[#1]),$([#6]:[#6])]","<regId=imine_one_B(4)>"
-"c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1]):o:c:3:c(-[#1]):c(:c(-[#8]-[#6](-[#1])-[#1]):c(:c:2:3)-[#1])-[#7](-[#1])-[#6](-[#1])-[#1]","<regId=anil_OC_alk_A(4)>"
-"[#16]=[#6]-,:1-,:[#7](-[#1])-,:[#6]=,:[#6]-,:[#6]-2=,:[#6]-,:1-[#6](=[#8])-[#8]-[#6]-2=[#6]-[#1]","<regId=ene_five_het_J(4)>"
-"n2(-c:1:c(:c:c(:c(:c:1)-[#1])-[$([#7](-[#1])-[#1]),$([#6]:[#7])])-[#1])c(c(-[#1])c(c2-[#1])-[#1])-[#1]","<regId=pyrrole_G(4)>"
-"n1(-[#6])c(c(-[#1])c(c1-[#6](-[#1])=[#6]-2-[#6](=[#8])-[!#6&!#1]-[#6]=,:[!#1]-2)-[#1])-[#1]","<regId=ene_five_het_K(4)>"
-"[#6]=[#6]-[#6](-[#6]#[#7])(-[#6]#[#7])-[#6](-[#6]#[#7])=[#6]-[#7](-[#1])-[#1]","<regId=cyano_ene_amine_B(4)>"
-"[#6]:[#6]-[#6](=[#16;X1])-[#16;X2]-[#6](-[#1])-[$([#6](-[#1])-[#1]),$([#6]:[#6])]","<regId=thio_ester_B(4)>"
-"[#8]=[#6]-3-[#6](=!@[#6](-[#1])-c:1:c:n:c:c:1)-c:2:c:c:c:c:c:2-[#7]-3","<regId=ene_five_het_L(4)>"
-"c:1(:[c;!H0,$(c-[#6;!H0;!H1])](:c(:c(:s:1)-[#1])-[#1]))-[#6](-[#1])=[#7]-[#7](-[#1])-c:2:c:c:c:c:c:2","<regId=hzone_thiophene_B(4)>"
-"[#6](-[#1])(-[#1])-[#16;X2]-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](-[#6]#[#7])-[#6](=[#8])-[#7]-1","<regId=dhp_amino_CN_E(4)>"
-"[#7]-2(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#7](-[#1])-[#6]=[#8])-[#6](-[#1])(-[#1])-[#6]-2=[#8]","<regId=het_5_B(4)>"
-"[#6]:[#6]-[#6](-[#1])=[#6](-[#1])-[#6](-[#1])=[#7]-[#7]=[#6]","<regId=imine_imine_B(3)>"
-"c:1(:c:c:c(:c:c:1)-[#6](-[#1])-[#1])-c:2:c(:s:c(:n:2)-[#7](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#1]","<regId=thiazole_amine_B(3)>"
-"[#6]-2(-[#6]=[#7]-c:1:c:c:c:c:c:1-[#7]-2)=[#6](-[#1])-[#6]=[#8]","<regId=imine_ene_one_A(3)>"
-"[#8](-c:1:c:c:c:c:c:1)-c:3:c:c:2:n:o:n:c:2:c:c:3","<regId=diazox_A(3)>"
-"[!#1]:1:[!#1]:[!#1]:[!#1](:[!#1]:[!#1]:1)-[#6](-[#1])=[#6](-[#1])-[#6](-[#7]-c:2:c:c:c:3:c(:c:2):c:c:c(:n:3)-[#7](-[#6])-[#6])=[#8]","<regId=ene_one_A(3)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c:c:c:n:1)-[#8]-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=anil_OC_no_alk_C(3)>"
-"[#6]-[#16;X2]-c:1:n:c(:c:s:1)-[#1]","<regId=thiazol_SC_A(3)>"
-"c:1:c-3:c(:c:c:c:1)-[#7](-c:2:c:c:c:c:c:2-[#8]-3)-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=het_666_B(3)>"
-"c:1(:c(:c(:c(:o:1)-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#8]-[#1])-[#6]#[#6]-[#6;X4]","<regId=furan_A(3)>"
-"[#6]-1(-[#6](=[#6]-[#6]=[#6]-[#6]=[#6]-1)-[#7]-[#1])=[#7]-[#6]","<regId=colchicine_A(3)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])=[#6]-[#6](=[#8])-c:1:c(-[#16;X2]):s:c(:c:1)-[$([#6]#[#7]),$([#6]=[#8])]","<regId=thiophene_C(3)>"
-"c:1:3:c(:c:c:c:c:1)-[#7]-2-[#6](=[#8])-[#6](=[#6](-[F,Cl,Br,I])-[#6]-2=[#8])-[#7](-[#1])-[#6]:[#6]:[#6]:[#6](-[#8]-[#6](-[#1])-[#1]):[#6]:[#6]:3","<regId=anil_OC_alk_B(3)>"
-"c:1-2:c(:c:c:c:c:1)-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]=[#6]-2-[#16;X2]-[#6](-[#1])(-[#1])-[#6](=[#8])-c:3:c:c:c:c:c:3","<regId=het_thio_66_A(3)>"
-"[#7]-2(-c:1:c:c:c:c:c:1-[#6](-[#1])-[#1])-[#6](=[#16])-[#7](-[#6](-[#1])(-[#1])-[!#1]:[!#1]:[!#1]:[!#1]:[!#1])-[#6](-[#1])(-[#1])-[#6]-2=[#8]","<regId=rhod_sat_B(3)>"
-"[#7]-2(-[#6](-[#1])-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](=[#6](-[#1])-c:1:c:c:c:c(:c:1)-[Br])-[#6]-2=[#8]","<regId=ene_rhod_I(3)>"
-"c:1(:c(:c:2:c(:s:1):c:c:c:c:2)-[#6](-[#1])-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=keto_thiophene(3)>"
-"[#7](-[#6](-[#1])-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])=[#7]-[#6](-[#6](-[#1])-[#1])=[#7]-[#7](-[#6](-[#1])-[#1])-[#6]:[#6]","<regId=imine_imine_C(3)>"
-"[#6]:2(:[#6](-[#6](-[#1])-[#1]):[#6]-,:1:[#6](-,:[#7]=,:[#6;!H0,$([#6]-[#16]-[#6](-[#1])-[#1])](-,:[#7](-,:[#6]-,:1=[!#6&!#1;X1])-[#6](-[#1])-[$([#6](=[#8])-[#8]),$([#6]:[#6])])):[!#6&!#1;X2]:2)-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=het_65_pyridone_A(3)>"
-"c:1(:n:c(:c(-[#1]):s:1)-[!#1]:[!#1]:[!#1](-[$([#8]-[#6](-[#1])-[#1]),$([#6](-[#1])-[#1])]):[!#1]:[!#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c(-[#1]):c(:c(-[#1]):o:2)-[#1]","<regId=thiazole_amine_C(3)>"
-"n:1:c(:c(:c(:c(:c:1-[#16]-[#6]-[#1])-[#6]#[#7])-c:2:c:c:c(:c:c:2)-[#8]-[#6](-[#1])-[#1])-[#1])-[#6]:[#6]","<regId=het_thio_pyr_A(3)>"
-"c:1:4:c(:n:c(:n:c:1-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c(:c(:c(:o:2)-[#1])-[#1])-[#1])-[#7](-[#1])-c:3:c:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#16;X2]),$(c-[#8]-[#6]-[#1]),$(c-[#7;X3])](:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#16;X2]),$(c-[#8]-[#6]-[#1]),$(c-[#7;X3])](:c:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#16;X2]),$(c-[#8]-[#6]-[#1]),$(c-[#7;X3])]:3))):c:c:c:c:4","<regId=melamine_A(3)>"
-"[#7](-[#1])(-[#6]:1:[#6]:[#6]:[!#1]:[#6]:[#6]:1)-c:2:c:c:c(:c:c:2)-[#7](-[#1])-[#6]-[#1]","<regId=anil_NH_alk_B(3)>"
-"[#7]-2(-c:1:c:c:c:c:c:1)-[#6](=[#7]-[#6]=[#8])-[#16]-[#6](-[#1])(-[#1])-[#6]-2=[#8]","<regId=rhod_sat_C(3)>"
-"[#6]=[#6]-[#6](=[#8])-[#7]-c:1:c(:c(:c(:s:1)-[#6](=[#8])-[#8])-[#6]-[#1])-[#6]#[#7]","<regId=thiophene_amino_D(3)>"
-"[#8;!H0,$([#8]-[#6](-[#1])-[#1])]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:n:c:c:n:2","<regId=anil_OC_alk_C(3)>"
-"[#6](-[#1])(-[#1])-[#16;X2]-c3nc1c(n(nc1-[#6](-[#1])-[#1])-c:2:c:c:c:c:c:2)nn3","<regId=het_thio_65_A(3)>"
-"[#6]-[#6](=[#8])-[#6](-[#1])(-[#1])-[#16;X2]-c:3:n:n:c:2:c:1:c(:c(:c(:c(:c:1:n(:c:2:n:3)-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=het_thio_656b(3)>"
-"s:1:c(:[n+](-[#6](-[#1])-[#1]):c(:c:1-[#1])-[#6])-[#7](-[#1])-c:2:c:c:c:c:c:2[$([#6](-[#1])-[#1]),$([#6]:[#6])]","<regId=thiazole_amine_D(3)>"
-"[#6]-,:2(=[#16])-,:[#7](-[#6](-[#1])(-[#1])-c:1:c:c:c:o:1)-,:[#6](=,:[#7]-,:[#7]-,:2-[#1])-[#6]:[#6]","<regId=thio_urea_H(3)>"
-"[#7]-,:2(-c:1:c:c:c:c:c:1)-,:[#6](=[#8])-,:[#6](=,:[#6]-,:[#6](=,:[#7]-,:2)-[#6]#[#7])-[#6]#[#7]","<regId=cyano_pyridone_F(3)>"
-"[#7]-2(-c:1:c:c:c:c:c:1)-[#6](=[#8])-[#16]-[#6](-[#1])(-[#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6])-[#6]-2=[#8]","<regId=rhod_sat_D(3)>"
-"[#6](-[#1])(-[#1])-[#7]-2-[#6](=[$([#16]),$([#7])])-[!#6&!#1]-[#6](=[#6]-1-[#6](=[#6](-[#1])-[#6]:[#6]-[#7]-1-[#6](-[#1])-[#1])-[#1])-[#6]-2=[#8]","<regId=ene_rhod_J(3)>"
-"[#6]=[#7;!R]-c:1:c:c:c:c:c:1-[#8]-[#1]","<regId=imine_phenol_A(3)>"
-"[#8]=[#6]-,:2-,:[#16]-,:c:1:c(:c(:c:c:c:1)-[#8]-[#6](-[#1])-[#1])-,:[#8]-,:2","<regId=thio_carbonate_B(3)>"
-"[#7]=,:[#6]-,:1-,:[#7]=,:[#6]-,:[#7]-,:[#16]-,:1","<regId=het_thio_N_5A(3)>"
-"[#7]-,:2-,:[#16]-,:[#6]-1=,:[#6](-[#6]:[#6]-[#7]-[#6]-1)-,:[#6]-,:2=[#16]","<regId=het_thio_N_65A(3)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])=[#7]-[#7]=[#6](-[#6])-[#6]:[#6])-[#1])-[#1]","<regId=anil_di_alk_J(3)>"
-"n1-2cccc1-[#6]=[#7](-[#6])-[#6]-[#6]-2","<regId=pyrrole_H(3)>"
-"[#6](-[#6]#[#7])(-[#6]#[#7])=[#6](-[#16])-[#16]","<regId=ene_cyano_D(3)>"
-"[#6]-1(-[#6]#[#7])(-[#6]#[#7])-[#6](-[#1])(-[#6](=[#8])-[#6])-[#6]-1-[#1]","<regId=cyano_cyano_B(3)>"
-"[#6]-1=,:[#6]-[#6](-[#6](-[$([#8]),$([#16])]-1)=[#6]-[#6]=[#8])=[#8]","<regId=ene_five_het_M(3)>"
-"[#6]:[#6]-[#6](=[#8])-[#7](-[#1])-[#6](=[#8])-[#6](-[#6]#[#7])=[#6](-[#1])-[#7](-[#1])-[#6]:[#6]","<regId=cyano_ene_amine_C(3)>"
-"c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#7]=[#6]-c:2:c:n:c:c:2","<regId=thio_urea_I(3)>"
-"[#7](-[#1])(-[#1])-[#6]-2=[#6](-[#6]#[#7])-[#6](-[#1])(-c:1:c:c:c:s:1)-[#6](=[#6](-[#6](-[#1])-[#1])-[#8]-2)-[#6](=[#8])-[#8]-[#6]","<regId=dhp_amino_CN_F(3)>"
-"c:1:c-3:c(:c:c(:c:1)-[#6](=[#8])-[#7](-[#1])-c:2:c(:c:c:c:c:2)-[#6](=[#8])-[#8]-[#1])-[#6](-[#7](-[#6]-3=[#8])-[#6](-[#1])-[#1])=[#8]","<regId=anthranil_acid_B(3)>"
-"[Cl]-c:2:c:c:1:n:o:n:c:1:c:c:2","<regId=diazox_B(3)>"
-"[#6]-[#6](=[#16])-[#1]","<regId=thio_aldehyd_A(3)>"
-"[#6;X4]-[#7](-[#1])-[#6](-[#6]:[#6])=[#6](-[#1])-[#6](=[#16])-[#7](-[#1])-c:1:c:c:c:c:c:1","<regId=thio_amide_B(2)>"
-"[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#16]-[#6](-[#1])(-[#1])-c1cn(cn1)-[#1]","<regId=imidazole_B(2)>"
-"[#8]=[#6]-[#7](-[#1])-c:1:c(-[#6]:[#6]):n:c(-[#6](-[#1])(-[#1])-[#6]#[#7]):s:1","<regId=thiazole_amine_E(2)>"
-"[#6](-[#1])-[#7](-[#1])-c:1:n:c(:c:s:1)-c2cnc3n2ccs3","<regId=thiazole_amine_F(2)>"
-"[#7]-,:1-,:[#6](=[#8])-,:[#6](=,:[#6](-[#6])-,:[#16]-,:[#6]-,:1=[#16])-[#1]","<regId=thio_ester_C(2)>"
-"[#6](-[#16])(-[#7])=[#6](-[#1])-[#6]=[#6](-[#1])-[#6]=[#8]","<regId=ene_one_B(2)>"
-"[#8]=[#6]-3-c:1:c(:c:c:c:c:1)-[#6]-2=[#6](-[#8]-[#1])-[#6](=[#8])-[#7]-c:4:c-2:c-3:c:c:c:4","<regId=quinone_C(2)>"
-"c:1:2:c:c:c:c(:c:1:c(:c:c:c:2)-[$([#8]-[#1]),$([#7](-[#1])-[#1])])-[#6](-[#6])=[#8]","<regId=keto_naphthol_A(2)>"
-"[#6](-[#1])(-c:1:c:c:c:c:c:1)(-c:2:c:c:c:c:c:2)-[#6](=[#16])-[#7]-[#1]","<regId=thio_amide_C(2)>"
-"[#7]-2(-[#6](=[#8])-c:1:c(:c(:c(:c(:c:1-[#1])-[#6](=[#8])-[#8]-[#1])-[#1])-[#1])-[#6]-2=[#8])-c:3:c(:c:c(:c(:c:3)-[#1])-[#8])-[#1]","<regId=phthalimide_misc(2)>"
-"c:1:c:c(:c:c:c:1-[#7](-[#1])-[#16](=[#8])=[#8])-[#7](-[#1])-[#16](=[#8])=[#8]","<regId=sulfonamide_D(2)>"
-"[#6](-[#1])-[#7](-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6]-[#1]","<regId=anil_NH_alk_C(2)>"
-"s1c(c(c-,:2c1-,:[#7](-[#1])-,:[#6](-,:[#6](=,:[#6]-,:2-[#1])-[#6](=[#8])-[#8]-[#1])=[#8])-[#7](-[#1])-[#1])-[#6](=[#8])-[#7]-[#1]","<regId=het_65_E(2)>"
-"c:2(:c:1:c(:c(:c(:c(:c:1:c(:c(:c:2-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#7](-[#1])-[#6]=[#8])-[#1])-[#1])-[#1]","<regId=hzide_naphth(2)>"
-"[#6](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6;X4])-[#1]","<regId=anisol_B(2)>"
-"[#6]-1=[#6]-[#7]-[#6](-[#16]-[#6;X4]-1)=[#16]","<regId=thio_carbam_ene(2)>"
-"[#6](-[#7](-[#6]-[#1])-[#6]-[#1]):[#6]-[#7](-[#1])-[#6](=[#16])-[#6]-[#1]","<regId=thio_amide_D(2)>"
-"n2nc(c1cccc1c2-[#6])-[#6]","<regId=het_65_Da(2)>"
-"s:1:c(:c(-[#1]):c(:c:1-[#6](=[#8])-[#7](-[#1])-[#7]-[#1])-[#8]-[#6](-[#1])-[#1])-[#1]","<regId=thiophene_D(2)>"
-"[#6]-1:[#6]-[#7]=[#6]-[#6](=[#6]-[#7]-[#6])-[#16]-1","<regId=het_thio_6_ene(2)>"
-"[#6](-[#1])(-[#1])-[#6](-[#1])(-[#6]#[#7])-[#6](=[#8])-[#6]","<regId=cyano_keto_A(2)>"
-"c2(c(-[#7](-[#1])-[#1])n(-c:1:c:c:c:c:c:1-[#6](=[#8])-[#8]-[#1])nc2-[#6]=[#8])-[$([#6]#[#7]),$([#6]=[#16])]","<regId=anthranil_acid_C(2)>"
-"c:2:c:1:c:c:c:c-,:3:c:1:c(:c:c:2)-,:[#7](-,:[#7]=,:[#6]-,:3)-[#1]","<regId=naphth_amino_C(2)>"
-"c:2:c:1:c:c:c:c-,:3:c:1:c(:c:c:2)-,:[#7]-,:[#7]=,:[#7]-,:3","<regId=naphth_amino_D(2)>"
-"c1csc(n1)-[#7]-[#7]-[#16](=[#8])=[#8]","<regId=thiazole_amine_G(2)>"
-"c:1:c:c:c:2:c(:c:1):n:c(:n:c:2)-[#7](-[#1])-[#6]-3=[#7]-[#6](-[#6]=[#6]-[#7]-3-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=het_66_B(2)>"
-"c:1-,:3:c(:c(:c(:c(:c:1)-[#8]-[#6]-[#1])-[#1])-[#1])-,:c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-,:[#6](=[#8])-,:[#8]-,:3","<regId=coumarin_A(2)>"
-"c:12:c(:c:c:c:n:1)c(c(-[#6](=[#8])~[#8;X1])s2)-[#7](-[#1])-[#1]","<regId=anthranil_acid_D(2)>"
-"c:1:2:n:c(:c(:n:c:1:[#6]:[#6]:[#6]:[!#1]:2)-[#6](-[#1])=[#6](-[#8]-[#1])-[#6])-[#6](-[#1])=[#6](-[#8]-[#1])-[#6]","<regId=het_66_C(2)>"
-"c1csc(c1-[#7](-[#1])-[#1])-[#6](-[#1])=[#6](-[#1])-c2cccs2","<regId=thiophene_amino_E(2)>"
-"c:2:c:c:1:n:c:3:c(:n:c:1:c:c:2):c:c:c:4:c:3:c:c:c:c:4","<regId=het_6666_A(2)>"
-"[#6]:[#6]-[#7](-[#1])-[#16](=[#8])(=[#8])-[#7](-[#1])-[#6]:[#6]","<regId=sulfonamide_E(2)>"
-"c:1:c:c(:c:c:c:1-[#7](-[#1])-[#1])-[#7](-[#6;X3])-[#6;X3]","<regId=anil_di_alk_K(2)>"
-"[#7]-2=[#6](-c:1:c:c:c:c:c:1)-[#6](-[#1])(-[#1])-[#6](-[#8]-[#1])(-[#6](-[#9])(-[#9])-[#9])-[#7]-2-[$([#6]:[#6]:[#6]:[#6]:[#6]:[#6]),$([#6](=[#16])-[#6]:[#6]:[#6]:[#6]:[#6]:[#6])]","<regId=het_5_C(2)>"
-"c:1:c(:c:c:c:c:1)-[#6](=[#8])-[#6](-[#1])=[#6]-,:3-,:[#6](=[#8])-,:[#7](-[#1])-,:[#6](=[#8])-,:[#6](=[#6](-[#1])-c:2:c:c:c:c:c:2)-,:[#7]-,:3-[#1]","<regId=ene_six_het_B(2)>"
-"[#8]=[#6]-4-[#6]-[#6]-[#6]-3-[#6]-2-[#6](=[#8])-[#6]-[#6]-1-[#6]-[#6]-[#6]-[#6]-1-[#6]-2-[#6]-[#6]-[#6]-3=[#6]-4","<regId=steroid_A(2)>"
-"c:1:2:c:3:c(:c(-[#8]-[#1]):c(:c:1:c(:c:n:2-[#6])-[#6]=[#8])-[#1]):n:c:n:3","<regId=het_565_A(2)>"
-"[#6;X4]-[#7+](-[#6;X4]-[#8]-[#1])=[#6]-[#16]-[#6]-[#1]","<regId=thio_imine_ium(2)>"
-"[#6]-3(=[#8])-[#6](=[#6](-[#1])-[#7](-[#1])-c:1:c:c:c:c:c:1-[#6](=[#8])-[#8]-[#1])-[#7]=[#6](-c:2:c:c:c:c:c:2)-[#8]-3","<regId=anthranil_acid_E(2)>"
-"c:1(:c(:c(:[c;!H0,$(c-[#6;!H0;!H1])](:o:1))-[#1])-[#1])-[#6;!H0,$([#6]-[#6;!H0;!H1])]=[#7]-[#7](-[#1])-c:2:c:c:n:c:c:2","<regId=hzone_furan_B(2)>"
-"c:1(:c(:c(:[c;!H0,$(c-[#6;!H0,!H1])](:s:1))-[#1])-[#1])-[#6;!H0,$([#6]-[#6;!H0;!H1])]-[#6](=[#8])-[#7](-[#1])-c:2:n:c:c:s:2","<regId=thiophene_E(2)>"
-"[#6]:[#6]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#6]=[#8])-[#7]-2-[#6](=[#8])-[#6]-1(-[#1])-[#6](-[#1])(-[#1])-[#6]=[#6]-[#6](-[#1])(-[#1])-[#6]-1(-[#1])-[#6]-2=[#8]","<regId=ene_misc_B(2)>"
-"[#6]-1(-[#6]=[#8])(-[#6]:[#6])-[#16;X2]-[#6]=[#7]-[#7]-1-[#1]","<regId=het_thio_5_B(2)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-c:2:c:c:c:c:c:2)-[#6]#[#7])-[#6]:3:[!#1]:[!#1]:[!#1]:[!#1]:[!#1]:3","<regId=thiophene_amino_F(2)>"
-"[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c:c:c:2-[$([#6](-[#1])-[#1]),$([#8]-[#6](-[#1])-[#1])]","<regId=anil_OC_alk_D(2)>"
-"[#6](-[#1])(-[#1])(-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-c:1:c(:c:c(:c(:c:1-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-[#6](-[#1])(-[#1])-[#1])-[#8]-[#6](-[#1])-[#7])-[#1]","<regId=tert_butyl_A(2)>"
-"c:1(:c(:o:c:c:1)-[#6]-[#1])-[#6]=[#7]-[#7](-[#1])-[#6](=[#16])-[#7]-[#1]","<regId=thio_urea_J(2)>"
-"[#7](-[#1])-c1nc(nc2nnc(n12)-[#16]-[#6])-[#7](-[#1])-[#6]","<regId=het_thio_65_B(2)>"
-"c:1-,:2:c(:c:c:c:c:1-[#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1])-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-[#6]:[#6])-,:[#6](=[#8])-,:[#8]-,:2)-[#1]","<regId=coumarin_B(2)>"
-"[#6]-2(=[#16])-[#7]-1-[#6]:[#6]-[#7]=[#7]-[#6]-1=[#7]-[#7]-2-[#1]","<regId=thio_urea_K(2)>"
-"[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-c:1:c:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-[#6])-[#6](=[#8])-[#8]-[#1]","<regId=thiophene_amino_G(2)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c:c:1-[#7](-[#1])-[#6](-[#1])(-[#6])-[#6](-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_NH_alk_D(2)>"
-"[#16]=[#6]-,:2-,:[#7](-[#1])-,:[#7]=,:[#6](-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-,:[#8]-,:2","<regId=het_thio_5_C(2)>"
-"[#16]=[#6]-c:1:c:c:c:2:c:c:c:c:n:1:2","<regId=thio_keto_het(2)>"
-"[#6]~1~[#6](~[#7]~[#7]~[#6](~[#6](-[#1])-[#1])~[#6](-[#1])-[#1])~[#7]~[#16]~[#6]~1","<regId=het_thio_N_5B(2)>"
-"[#6]-1(-[#6]=,:[#6]-[#6]=,:[#6]-[#6]-1=[!#6&!#1])=[!#6&!#1]","<regId=quinone_D(2)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(-[#1]):c(:c(:o:1)-[#6](-[#1])=[#6]-[#6]#[#7])-[#1]","<regId=anil_di_alk_furan_B(2)>"
-"[#8]=[#6]-1-[#6]:[#6]-[#6](-[#1])(-[#1])-[#7]-[#6]-1=[#6]-[#1]","<regId=ene_six_het_C(2)>"
-"[#6]:[#6]-[#7]:2:[#7]:[#6]:1-[#6](-[#1])(-[#1])-[#16;X2]-[#6](-[#1])(-[#1])-[#6]:1:[#6]:2-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])=[#6]-[#1]","<regId=het_55_A(2)>"
-"n:1:c(:n(:c:2:c:1:c:c:c:c:2)-[#6](-[#1])-[#1])-[#16]-[#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6](-[#1])-[#6](-[#1])=[#6]-[#1]","<regId=het_thio_65_C(2)>"
-"c:1(:c:c(:c(:c:c:1)-[#8]-[#1])-[#6](=!@[#6]-[#7])-[#6]=[#8])-[#8]-[#1]","<regId=hydroquin_A(2)>"
-"c:1(:c:c(:c(:c:c:1)-[#7](-[#1])-[#6](=[#8])-[#6]:[#6])-[#6](=[#8])-[#8]-[#1])-[#8]-[#1]","<regId=anthranil_acid_F(2)>"
-"n2(-[#6](-[#1])-[#1])c-1c(-[#6]:[#6]-[#6]-1=[#8])cc2-[#6](-[#1])-[#1]","<regId=pyrrole_I(2)>"
-"[#6](-[#1])-[#7](-[#1])-c:1:c(:c(:c(:s:1)-[#6]-[#1])-[#6]-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6]","<regId=thiophene_amino_H(2)>"
-"[#6]:[#6]-[#7;!R]=[#6]-2-[#6](=[!#6&!#1])-c:1:c:c:c:c:c:1-[#7]-2","<regId=imine_one_fives_C(2)>"
-"c:1:c:c:c:c:c:1-[#6](=[#8])-[#7](-[#1])-[#7]=[#6]-3-c:2:c:c:c:c:c:2-c:4:c:c:c:c:c-3:4","<regId=keto_phenone_zone_A(2)>"
-"c:1:c(:c:c:c:c:1)-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])=[#6](-[#1])-[#6]=!@[#6](-[#1])-[#6](-[#1])=[#6]-[#6]=@[#7]-c:2:c:c:c:c:c:2","<regId=dyes7A(2)>"
-"[#6]:1:2:[!#1]:[#7+](:[!#1]:[#6;!H0,$([#6]-[*])](:[!#1]:1:[#6]:[#6]:[#6]:[#6]:2))~[#6]:[#6]","<regId=het_pyridiniums_B(2)>"
-"[#7]-2(-c:1:c:c:c:c:c:1)-[#7]=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#16]-[#6])-[#6]-2=[#8]","<regId=het_5_D(2)>"
-"c:1:c:c:c(:c:c:1-[#7](-[#1])-c2nc(c(-[#1])s2)-c:3:c:c:c(:c:c:3)-[#6](-[#1])(-[#6]-[#1])-[#6]-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=thiazole_amine_H(1)>"
-"[#6](-[#1])(-[#1])-[#7](-[#1])-[#6]=[#7]-[#7](-[#1])-c1nc(c(-[#1])s1)-[#6]:[#6]","<regId=thiazole_amine_I(1)>"
-"[#6]:[#6]-[#7](-[#1])-[#6](=[#8])-c1c(snn1)-[#7](-[#1])-[#6]:[#6]","<regId=het_thio_N_5C(1)>"
-"[#8]=[#16](=[#8])(-[#6]:[#6])-[#7](-[#1])-c1nc(cs1)-[#6]:[#6]","<regId=sulfonamide_F(1)>"
-"[#8]=[#16](=[#8])(-[#6]:[#6])-[#7](-[#1])-[#7](-[#1])-c1nc(cs1)-[#6]:[#6]","<regId=thiazole_amine_J(1)>"
-"s2c:1:n:c:n:c(:c:1c(c2-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#7]-[#7]=[#6]-c3ccco3","<regId=het_65_F(1)>"
-"[#6](=[#8])-[#6](-[#1])=[#6](-[#8]-[#1])-[#6](-[#8]-[#1])=[#6](-[#1])-[#6](=[#8])-[#6]","<regId=keto_keto_beta_E(1)>"
-"c:2(:c:1-[#6](-[#6](-[#6](-c:1:c(:c(:c:2-[#1])-[#1])-[#1])(-[#1])-[#1])=[#8])=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1]","<regId=ene_five_one_B(1)>"
-"[#6]:[#6]-[#7](-[#1])-[#7]=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#6](-[#1])-[#1])=[#7]-[#7](-[#1])-[#6]:[#6]","<regId=keto_keto_beta_zone(1)>"
-"[#6;X4]-[#16;X2]-[#6](=[#7]-[!#1]:[!#1]:[!#1]:[!#1])-[#7](-[#1])-[#7]=[#6]","<regId=thio_urea_L(1)>"
-"[#6]-1(=[#7]-[#7](-[#6](-[#16]-1)=[#6](-[#1])-[#6]:[#6])-[#6]:[#6])-[#6]=[#8]","<regId=het_thio_urea_ene(1)>"
-"c:1(:c(:c:2:c(:n:c:1-[#7](-[#1])-[#1]):c:c:c(:c:2-[#7](-[#1])-[#1])-[#6]#[#7])-[#6]#[#7])-[#6]#[#7]","<regId=cyano_amino_het_A(1)>"
-"[!#1]:1:[!#1]:[!#1]:[!#1](:[!#1]:[!#1]:1)-[#6](-[#1])=[#6](-[#1])-[#6](-[#7](-[#1])-[#7](-[#1])-c2nnnn2-[#6])=[#8]","<regId=tetrazole_hzide(1)>"
-"c:1:2:c(:c(:c(:c(:c:1:c(:c(:c(:c:2-[#1])-[#1])-[#6](=[#7]-[#6]:[#6])-[#6](-[#1])-[#1])-[#8]-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=imine_naphthol_A(1)>"
-"c:1(:c(:c:2:c(:c(:c:1-[#8]-[#6](-[#1])-[#1])-[#1]):c(:c(:c(:c:2-[#7](-[#1])-[#6](-[#1])(-[#1])-[#1])-[#1])-c:3:c(:c(:c(:c(:c:3-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=misc_anisole_A(1)>"
-"c:1:c:c-2:c(:c:c:1)-[#16]-c3c(-[#7]-2)cc(s3)-[#6](-[#1])-[#1]","<regId=het_thio_665(1)>"
-"c:1:c:c:c-2:c(:c:1)-[#6](-[#6](-[#7]-2-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]-4-[#6](-c:3:c:c:c:c:c:3-[#6]-4=[#8])=[#8])(-[#1])-[#1])(-[#1])-[#1]","<regId=anil_di_alk_L(1)>"
-"c:1(:c:c:c(:c:c:1)-[#6]-,:3=,:[#6]-,:[#6](-,:c2cocc2-,:[#6](=,:[#6]-,:3)-[#8]-[#1])=[#8])-[#16]-[#6](-[#1])-[#1]","<regId=colchicine_B(1)>"
-"[#6;X4]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#16]-[#6](-[#1])(-[#1])-[#1])-[#6](=[#8])-[#8]-[#1])-[#1])-[#1]","<regId=misc_aminoacid_A(1)>"
-"n:1:c(:n(:c(:c:1-c:2:c:c:c:c:c:2)-c:3:c:c:c:c:c:3)-[#7]=!@[#6])-[#7](-[#1])-[#1]","<regId=imidazole_amino_A(1)>"
-"[#6](-c:1:c:c:c(:c:c:1)-[#8]-[#1])(-c:2:c:c:c(:c:c:2)-[#8]-[#1])-[#8]-[#16](=[#8])=[#8]","<regId=phenol_sulfite_A(1)>"
-"c:2:c:c:1:n:c(:c(:n:c:1:c:c:2)-[#6](-[#1])(-[#1])-[#6](=[#8])-[#6]:[#6])-[#6](-[#1])(-[#1])-[#6](=[#8])-[#6]:[#6]","<regId=het_66_D(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c(-[#6](-[#1])-[#1])c:c:2","<regId=misc_anisole_B(1)>"
-"[#6](-[#1])(-[#1])-c1nnnn1-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#1])-[#1]","<regId=tetrazole_A(1)>"
-"[#6]-2(=[#7]-c1c(c(nn1-[#6](-[#6]-2(-[#1])-[#1])=[#8])-[#7](-[#1])-[#1])-[#7](-[#1])-[#1])-[#6]","<regId=het_65_G(1)>"
-"[#6](-[#6]:[#6])(-[#6]:[#6])(-[#6]:[#6])-[#16]-[#6]:[#6]-[#6](=[#8])-[#8]-[#1]","<regId=misc_trityl_A(1)>"
-"[#8]=[#6](-c:1:c(:c(:n:c(:c:1-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=misc_pyridine_OC(1)>"
-"[#7]-1=[#6](-[#7](-[#6](-[#6](-[#6]-1(-[#1])-[#6]:[#6])(-[#1])-[#1])=[#8])-[#1])-[#7]-[#1]","<regId=het_6_hydropyridone(1)>"
-"[#6]-1(=[#6](-[#6](-[#6](-[#6](-[#6]-1(-[#1])-[#1])(-[#1])-[#6](=[#8])-[#6])(-[#1])-[#6](=[#8])-[#8]-[#1])(-[#1])-[#1])-[#6]:[#6])-[#6]:[#6]","<regId=misc_stilbene(1)>"
-"[#6](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[Cl])-[#1])-[#1])(-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[Cl])-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-c3nc(c(n3-[#6](-[#1])(-[#1])-[#1])-[#1])-[#1]","<regId=misc_imidazole(1)>"
-"n:1:c(:c(:c(:c(:c:1-[#1])-[#7](-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6]:[#6]","<regId=anil_NH_no_alk_A(1)>"
-"[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#8]-[#1])-[#6]-2=[#6](-[#8]-[#6](-[#7]=[#7]-2)=[#7])-[#7](-[#1])-[#1]","<regId=het_6_imidate_B(1)>"
-"[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=anil_alk_B(1)>"
-"c:1:c:c-3:c(:c:c:1)-c:2:c:c:c(:c:c:2-[#6]-3=[#6](-[#1])-[#6])-[#7](-[#1])-[#1]","<regId=styrene_anil_A(1)>"
-"c:1:c:c-2:c(:c:c:1)-[#7](-[#6](-[#8]-[#6]-2)(-[#6](=[#8])-[#8]-[#1])-[#6](-[#1])-[#1])-[#6](=[#8])-[#6](-[#1])-[#1]","<regId=misc_aminal_acid(1)>"
-"n:1:c(:c(:c(:c(:c:1-[#7](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=anil_no_alk_D(1)>"
-"[#7](-[#1])(-c:1:c:c:c:c:c:1)-[#6](-[#6])(-[#6])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=anil_alk_C(1)>"
-"[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#1])-[#8]-[#6]-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])(-[#1])-[#1])-[#6]:[#6]","<regId=misc_anisole_C(1)>"
-"c:1-2:c:c-3:c(:c:c:1-[#8]-[#6]-[#8]-2)-[#6]-[#6]-3","<regId=het_465_misc(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](=[#8])-[#8]-[#1])-[#7](-[#1])-[#6]:[#6]","<regId=anthranil_acid_G(1)>"
-"c:1(:c:4:c(:n:c(:c:1-[#6](-[#1])(-[#1])-[#7]-3-c:2:c(:c(:c(:c(:c:2-[#6](-[#1])(-[#1])-[#6]-3(-[#1])-[#1])-[#1])-[#1])-[#1])-[#1])-[#1]):c(:c(:c(:c:4-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_M(1)>"
-"c:1:c(:c2:c(:c:c:1)c(c(n2-[#1])-[#6]:[#6])-[#6]:[#6])-[#6](=[#8])-[#8]-[#1]","<regId=anthranil_acid_H(1)>"
-"[#6]:[#6]-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-c:1:c(:c(:c(:c(:c:1-[F,Cl,Br,I])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=thio_urea_M(1)>"
-"n:1:c3:c(:c:c2:c:1nc(s2)-[#7])sc(n3)-[#7]","<regId=thiazole_amine_K(1)>"
-"[#7]=[#6]-1-[#16]-[#6](=[#7])-[#7]=[#6]-1","<regId=het_thio_5_imine_A(1)>"
-"c:1:c(:n:c:c:c:1)-[#6](=[#16])-[#7](-[#1])-c:2:c(:c:c:c:c:2)-[#8]-[#6](-[#1])-[#1]","<regId=thio_amide_E(1)>"
-"c:1-2:c(:c(:c(:c(:c:1-[#6](-c:3:c(-[#16]-[#6]-2(-[#1])-[#1]):c(:c(-[#1]):c(:c:3-[#1])-[#1])-[#1])-[#8]-[#6]:[#6])-[#1])-[#1])-[#1])-[#1]","<regId=het_thio_676_B(1)>"
-"[#6](-[#1])(-[#1])(-[#1])-c:1:c(:c(:c(:c(:n:1)-[#7](-[#1])-[#16](-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])(=[#8])=[#8])-[#1])-[#1])-[#1]","<regId=sulfonamide_G(1)>"
-"[#6](=[#8])(-[#7]-1-[#6]-[#6]-[#16]-[#6]-[#6]-1)-c:2:c(:c(:c(:c(:c:2-[#16]-[#6](-[#1])-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=thio_thiomorph_Z(1)>"
-"c:1:c:c:3:c:2:c(:c:1)-[#6](-[#6]=[#6](-c:2:c:c:c:3)-[#8]-[#6](-[#1])-[#1])=[#8]","<regId=naphth_ene_one_A(1)>"
-"c:1-3:c:2:c(:c(:c:c:1)-[#7]):c:c:c:c:2-[#6](-[#6]=[#6]-3-[#6](-[F])(-[F])-[F])=[#8]","<regId=naphth_ene_one_B(1)>"
-"c:1:c:c:c:c:2:c:1:c:c:3:c(:n:2):n:c:4:c(:c:3-[#7]):c:c:c:c:4","<regId=amino_acridine_A(1)>"
-"c:1:c-3:c(:c:c:c:1)-[#6]-2=[#7]-[!#1]=[#6]-[#6]-[#6]-2-[#6]-3=[#8]","<regId=keto_phenone_B(1)>"
-"c:1-3:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](=[#7]-[#7](-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#6](=[#8])-[#8]-[#1])-[#1])-[#1])-c:4:c-3:c(:c(:c(:c:4-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#1]","<regId=hzone_acid_A(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#1])-[#1])-[#1])-[#16](=[#8])(=[#8])-[#7](-[#1])-c:2:n:n:c(:c(:c:2-[#1])-[#1])-[#1]","<regId=sulfonamide_H(1)>"
-"c2(c(-[#1])n(-[#6](-[#1])-[#1])c:3:c(:c(:c:1n(c(c(c:1:c2:3)-[#1])-[#1])-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1]","<regId=het_565_indole(1)>"
-"c1(c-2c(c(n1-[#6](-[#8])=[#8])-[#6](-[#1])-[#1])-[#16]-[#6](-[#1])(-[#1])-[#16]-2)-[#6](-[#1])-[#1]","<regId=pyrrole_J(1)>"
-"s1ccnc1-c2c(n(nc2-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=pyrazole_amino_B(1)>"
-"c1(c(c(c(n1-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=pyrrole_K(1)>"
-"c:1:2(:c(:c(:c(:o:1)-[#6])-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6](-[#1]):[#6](-[#1]):[#6](-[#1]):[#6](-[#1]):[#6]:2-[#6](=[#8])-[#8]-[#1]","<regId=anthranil_acid_I(1)>"
-"[!#1]:[#6]-[#6](=[#16])-[#7](-[#1])-[#7](-[#1])-[#6]:[!#1]","<regId=thio_amide_F(1)>"
-"[#6]-1(=[#8])-[#6](-[#6](-[#6]#[#7])=[#6](-[#1])-[#7])-[#6](-[#7])-[#6]=[#6]-1","<regId=ene_one_C(1)>"
-"c2(c-1n(-[#6](-[#6]=[#6]-[#7]-1)=[#8])nc2-c3cccn3)-[#6]#[#7]","<regId=het_65_H(1)>"
-"[#8]=[#6]-1-[#6](=[#7]-[#7]-[#6]-[#6]-1)-[#6]#[#7]","<regId=cyano_imine_D(1)>"
-"c:2(:c:1:c:c:c:c:c:1:n:n:c:2)-[#6](-[#6]:[#6])-[#6]#[#7]","<regId=cyano_misc_A(1)>"
-"c:1:c:c-2:c(:c:c:1)-[#6]=[#6]-[#6](-[#7]-2-[#6](=[#8])-[#7](-[#1])-c:3:c:c(:c(:c:c:3)-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=ene_misc_C(1)>"
-"c:2:c:c:1:n:c(:c(:n:c:1:c:c:2)-c:3:c:c:c:c:c:3)-c:4:c:c:c:c:c:4-[#8]-[#1]","<regId=het_66_E(1)>"
-"[#6](-[#1])(-[#1])-[#6](-[#8]-[#1])=[#6](-[#6](=[#8])-[#6](-[#1])-[#1])-[#6](-[#1])-[#6]#[#6]","<regId=keto_keto_beta_F(1)>"
-"c:1:c:4:c(:c:c2:c:1nc(n2-[#1])-[#6]-[#8]-[#6](=[#8])-c:3:c:c(:c:c(:c:3)-[#7](-[#1])-[#1])-[#7](-[#1])-[#1]):c:c:c:c:4","<regId=misc_naphthimidazole(1)>"
-"c:2(:c:1:c:c:c:c-3:c:1:c(:c:c:2)-[#6]=[#6]-[#6]-3=[#7])-[#7]","<regId=naphth_ene_one_C(1)>"
-"c:2(:c:1:c:c:c:c:c:1:c-3:c(:c:2)-[#6](-c:4:c:c:c:c:c-3:4)=[#8])-[#8]-[#1]","<regId=keto_phenone_C(1)>"
-"[#6]-,:2(-,:[#6]=,:[#7]-,:c:1:c:c(:c:c:c:1-,:[#8]-,:2)-[Cl])=[#8]","<regId=coumarin_C(1)>"
-"[#6]-1=[#6]-[#7](-[#6](-c:2:c-1:c:c:c:c:2)(-[#6]#[#7])-[#6](=[#16])-[#16])-[#6]=[#8]","<regId=thio_est_cyano_A(1)>"
-"c2(nc:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])n2-[#6])-[#7](-[#1])-[#6](-[#7](-[#1])-c:3:c(:c:c:c:c:3-[#1])-[#1])=[#8]","<regId=het_65_imidazole(1)>"
-"[#7](-[#1])(-[#6]:[#6])-c:1:c(-[#6](=[#8])-[#8]-[#1]):c:c:c(:n:1)-,:[#6]:[#6]","<regId=anthranil_acid_J(1)>"
-"c:1-3:c(:c:c:c:c:1)-[#16]-[#6](=[#7]-[#7]=[#6]-2-[#6]=[#6]-[#6]=[#6]-[#6]=[#6]-2)-[#7]-3-[#6](-[#1])-[#1]","<regId=colchicine_het(1)>"
-"c:1-2:c(:c(:c(:c(:c:1-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](=[#6](-[#6])-[#16]-[#6]-2(-[#1])-[#1])-[#6]","<regId=ene_misc_D(1)>"
-"c:12:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])c(c(-[#6]:[#6])n2-!@[#6]:[#6])-[#6](-[#1])-[#1]","<regId=indole_3yl_alk_B(1)>"
-"[#7](-[#1])(-[#1])-c:1:c:c:c(:c:c:1-[#8]-[#1])-[#16](=[#8])(=[#8])-[#8]-[#1]","<regId=anil_OH_no_alk_A(1)>"
-"s:1:c:c:c(:c:1-[#1])-c:2:c:s:c(:n:2)-[#7](-[#1])-[#1]","<regId=thiazole_amine_L(1)>"
-"c1c(-[#7](-[#1])-[#1])nnc1-c2c(-[#6](-[#1])-[#1])oc(c2-[#1])-[#1]","<regId=pyrazole_amino_A(1)>"
-"n1nscc1-c2nc(no2)-[#6]:[#6]","<regId=het_thio_N_5D(1)>"
-"c:1(:c:c-3:c(:c:c:1)-[#7]-[#6]-4-c:2:c:c:c:c:c:2-[#6]-[#6]-3-4)-[#6;X4]","<regId=anil_alk_indane(1)>"
-"c:1-2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#6](=[#6](-[#1])-[#6]-3-[#6](-[#6]#[#7])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#7]-2-3)-[#1]","<regId=anil_di_alk_N(1)>"
-"c:2-,:3:c(:c:c:1:c:c:c:c:c:1:c:2)-,:[#7](-[#6](-[#1])-[#1])-,:[#6](=[#8])-,:[#6](=,:[#7]-,:3)-[#6]:[#6]-[#7](-[#1])-[#6](-[#1])-[#1]","<regId=het_666_C(1)>"
-"[#6](-[#8]-[#1]):[#6]-[#6](=[#8])-[#6](-[#1])=[#6](-[#6])-[#6]","<regId=ene_one_D(1)>"
-"c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1]):c(:c(-[#1]):n:2-[#1])-[#16](=[#8])=[#8]","<regId=anil_di_alk_indol(1)>"
-"c:1:2:c(:c(:c(:c(:c:1-[#1])-[#1])-[#7](-[#1])-[#1])-[#1]):c(:c(-[#1]):n:2-[#6](-[#1])-[#1])-[#1]","<regId=anil_no_alk_indol_A(1)>"
-"[#16;X2]-1-[#6]=[#6](-[#6]#[#7])-[#6](-[#6])(-[#6]=[#8])-[#6](=[#6]-1-[#7](-[#1])-[#1])-[$([#6]=[#8]),$([#6]#[#7])]","<regId=dhp_amino_CN_G(1)>"
-"[#7]-2-[#6]=[#6](-[#6]=[#8])-[#6](-c:1:c:c:c(:c:c:1)-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#6]~3=,:[#6]-2~[#7]~[#6](~[#16])~[#7]~[#6]~3~[#7]","<regId=anil_di_alk_dhp(1)>"
-"c:1:c(:c:c:c:c:1)-[#6](=[#8])-[#7](-[#1])-c:2:c(:c:c:c:c:2)-[#6](=[#8])-[#7](-[#1])-[#7](-[#1])-c:3:n:c:c:s:3","<regId=anthranil_amide_A(1)>"
-"c:1:c:2:c(:c:c:c:1):c(:c:3:c(:c:2):c:c:c:c:3)-[#6]=[#7]-[#7](-[#1])-c:4:c:c:c:c:c:4","<regId=hzone_anthran_Z(1)>"
-"c:1:c(:c:c:c:c:1)-[#6](-[#1])-[#7]-[#6](=[#8])-[#6](-[#7](-[#1])-[#6](-[#1])-[#1])=[#6](-[#1])-[#6](=[#8])-c:2:c:c:c(:c:c:2)-[#8]-[#6](-[#1])-[#1]","<regId=ene_one_amide_A(1)>"
-"s:1:c(:c(-[#1]):c(:c:1-[#6]-3=[#7]-c:2:c:c:c:c:c:2-[#6](=[#7]-[#7]-3-[#1])-c:4:c:c:n:c:c:4)-[#1])-[#1]","<regId=het_76_A(1)>"
-"o:1:c(:c(-[#1]):c(:c:1-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](=[#16])-[#7](-[#6]-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c:c:c:2)-[#1])-[#1]","<regId=thio_urea_N(1)>"
-"c:1:c(:c:c:c:c:1)-[#7](-[#6]-[#1])-[#6](-[#1])-[#6](-[#1])-[#6](-[#1])-[#7](-[#1])-[#6](=[#8])-[#6]-,:2=,:[#6](-,:[#8]-,:[#6](-,:[#6](=,:[#6]-,:2-[#6](-[#1])-[#1])-[#1])=[#8])-[#6](-[#1])-[#1]","<regId=anil_di_alk_coum(1)>"
-"c2-3:c:c:c:1:c:c:c:c:c:1:c2-[#6](-[#1])-[#6;X4]-[#7]-[#6]-3=[#6](-[#1])-[#6](=[#8])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=ene_one_amide_B(1)>"
-"c:1:c(:c:c:c:c:1)-[#6]-4=[#7]-[#7]:2:[#6](:[#7+]:c:3:c:2:c:c:c:c:3)-[#16]-[#6;X4]-4","<regId=het_thio_656c(1)>"
-"[#6]-2(=[#8])-[#6](=[#6](-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#7]=[#6](-c:1:c:c:c:c:c:1)-[#8]-2","<regId=het_5_ene(1)>"
-"c:1:c(:c:c:c:c:1)-[#7]-2-[#6](=[#8])-[#6](=[#6](-[#1])-[#6]-2=[#8])-[#16]-c:3:c:c:c:c:c:3","<regId=thio_imide_A(1)>"
-"[#7]-,:1(-[#1])-,:[#7]=,:[#6](-[#7]-[#1])-,:[#16]-,:[#6](=,:[#6]-,:1-,:[#6]:[#6])-,:[#6]:[#6]","<regId=dhp_amidine_A(1)>"
-"c:1(:c(:c-3:c(:c(:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])-c:2:c(:c(:c(:o:2)-[#6]-[#1])-[#1])-[#1])-[#1])-[#8]-[#6](-[#8]-3)(-[#1])-[#1])-[#1])-[#1]","<regId=thio_urea_O(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-c:2:c:c:c:c:c:2)-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_di_alk_O(1)>"
-"[#8]=[#6]-!@n:1:c:c:c-,:2:c:1-,:[#7](-[#1])-,:[#6](=[#16])-,:[#7]-,:2-[#1]","<regId=thio_urea_P(1)>"
-"[#6](-[F])(-[F])-[#6](=[#8])-[#7](-[#1])-c:1:c(-[#1]):n(-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#8]-[#6](-[#1])(-[#1])-[#6]:[#6]):n:c:1-[#1]","<regId=het_pyraz_misc(1)>"
-"[#7]-2=[#7]-[#6]:1:[#7]:[!#6&!#1]:[#7]:[#6]:1-[#7]=[#7]-[#6]:[#6]-2","<regId=diazox_C(1)>"
-"[#6]-2(-[#1])(-[#8]-[#1])-[#6]:1:[#7]:[!#6&!#1]:[#7]:[#6]:1-[#6](-[#1])(-[#8]-[#1])-[#6]=[#6]-2","<regId=diazox_D(1)>"
-"[#6]-1(-[#6](-[#1])(-[#1])-[#6]-1(-[#1])-[#1])(-[#6](=[#8])-[#7](-[#1])-c:2:c:c:c(:c:c:2)-[#8]-[#6](-[#1])(-[#1])-[#8])-[#16](=[#8])(=[#8])-[#6]:[#6]","<regId=misc_cyclopropane(1)>"
-"[#6]-1:[#6]-[#6](=[#8])-[#6]=[#6]-1-[#7]=[#6](-[#1])-[#7](-[#6;X4])-[#6;X4]","<regId=imine_ene_one_B(1)>"
-"c:1:c:c(:c:c-,:2:c:1-,:[#6](=,:[#6](-[#1])-,:[#6](=[#8])-,:[#8]-,:2)-c:3:c:c:c:c:c:3)-[#8]-[#6](-[#1])(-[#1])-[#6]:[#8]:[#6]","<regId=coumarin_D(1)>"
-"c:1:c(:o:c(:c:1-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#7]-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#8]-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#8]-c:2:c:c-3:c(:c:c:2)-[#8]-[#6](-[#8]-3)(-[#1])-[#1]","<regId=misc_furan_A(1)>"
-"[#7]-4(-c:1:c:c:c:c:c:1)-[#6](=[#8])-[#16]-[#6](-[#1])(-[#7](-[#1])-c:2:c:c:c:c:3:c:c:c:c:c:2:3)-[#6]-4=[#8]","<regId=rhod_sat_E(1)>"
-"[#7]-3(-[#6](=[#8])-c:1:c:c:c:c:c:1)-[#6](=[#7]-c:2:c:c:c:c:c:2)-[#16]-[#6](-[#1])(-[#1])-[#6]-3=[#8]","<regId=rhod_sat_imine_A(1)>"
-"[#7]-2(-c:1:c:c:c:c:c:1)-[#6](=[#8])-[#16]-[#6](-[#1])(-[#1])-[#6]-2=[#16]","<regId=rhod_sat_F(1)>"
-"[#7]-1(-[#6](-[#1])-[#1])-[#6](=[#16])-[#7](-[#6]:[#6])-[#6](=[#7]-[#6]:[#6])-[#6]-1=[#7]-[#6]:[#6]","<regId=het_thio_5_imine_B(1)>"
-"[#16]-1-[#6](=[#7]-[#7]-[#1])-[#16]-[#6](=[#7]-[#6]:[#6])-[#6]-1=[#7]-[#6]:[#6]","<regId=het_thio_5_imine_C(1)>"
-"[#6]-2(=[#8])-[#6](=[#6](-[#1])-c:1:c(:c:c:c(:c:1)-[F,Cl,Br,I])-[#8]-[#6](-[#1])-[#1])-[#7]=[#6](-[#16]-[#6](-[#1])-[#1])-[#16]-2","<regId=ene_five_het_N(1)>"
-"[#6](-[#1])(-[#1])-[#16]-[#6](=[#16])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=thio_carbam_A(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6])-[#1])-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=misc_anilide_A(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#6](-[#1])-[#1])-[#1])-[Br])-[#1])-[#6](-[#1])-[#1])-[#7](-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1]","<regId=misc_anilide_B(1)>"
-"c:1-2:c(:c:c:c(:c:1-[#8]-[#6](-[#1])(-[#1])-[#7](-[#6]:[#6]-[#8]-[#6](-[#1])-[#1])-[#6]-2(-[#1])-[#1])-[#1])-[#1]","<regId=mannich_B(1)>"
-"c:1-2:c(:c(:c(:c(:c:1-[#8]-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6]-2(-[#1])-[#1])-[#1])-[#8])-[#8])-[#1]","<regId=mannich_catechol_A(1)>"
-"[#7](-[#1])(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=anil_alk_D(1)>"
-"n:1:2:c:c:c(:c:c:1:c:c(:c:2-[#6](=[#8])-[#6]:[#6])-[#6]:[#6])-[#6](~[#8])~[#8]","<regId=het_65_I(1)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#6](=[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1])-[#6](-[#6;X4])(-[#6;X4])-[#7](-[#1])-[#6](=[#8])-[#7](-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]","<regId=misc_urea_A(1)>"
-"[#6]-3(-[#1])(-n:1:c(:n:c(:c:1-[#1])-[#1])-[#1])-c:2:c(:c(:c(:c(:c:2-[#1])-[Br])-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-c:4:c-3:c(:c(:c(:c:4-[#1])-[#1])-[#1])-[#1]","<regId=imidazole_C(1)>"
-"[#6](=[#6](-[#1])-[#6](-[#1])(-[#1])-n:1:c(:n:c(:c:1-[#1])-[#1])-[#1])(-[#6]:[#6])-[#6]:[#6]","<regId=styrene_imidazole_A(1)>"
-"c:1(:n:c(:c(-[#1]):s:1)-c:2:c:c:n:c:c:2)-[#7](-[#1])-[#6]:[#6]-[#6](-[#1])-[#1]","<regId=thiazole_amine_M(1)>"
-"c:1(:n:c(:c(-[#1]):s:1)-c:2:c:c:c:c:c:2)-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]-[#6](-[#1])(-[#1])-c:3:c:c:c:n:3-[#1]","<regId=misc_pyrrole_thiaz(1)>"
-"n:1(-[#1]):c(:c(-[#6](-[#1])-[#1]):c(:c:1-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])-[#6](=[#8])-[#8]-[#6](-[#1])-[#1]","<regId=pyrrole_L(1)>"
-"c:2(:n:c:1:c(:c(:c:c(:c:1-[#1])-[F,Cl,Br,I])-[#1]):n:2-[#1])-[#16]-[#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6]","<regId=het_thio_65_D(1)>"
-"c:1(:c(:c-2:c(:c(:c:1-[#8]-[#6](-[#1])-[#1])-[#1])-[#6]=[#6]-[#6](-[#1])-[#16]-2)-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=ene_misc_E(1)>"
-"[#7]-1(-[#1])-[#6](=[#16])-[#6](-[#1])(-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6]-1-[#6]:[#6])-[#1]","<regId=thio_cyano_A(1)>"
-"n:1:c(:c(:c(:c(:c:1-[#16;X2]-c:2:c:c:c:c:c:2-[#7](-[#1])-[#1])-[#6]#[#7])-c:3:c:c:c:c:c:3)-[#6]#[#7])-[#7](-[#1])-[#1]","<regId=cyano_amino_het_B(1)>"
-"[#7]-,:2(-c:1:c:c:c(:c:c:1)-[#8]-[#6](-[#1])-[#1])-,:[#6](=[#8])-,:[#6](=,:[#6]-,:[#6](=,:[#7]-,:2)-n:3:c:n:c:c:3)-[#6]#[#7]","<regId=cyano_pyridone_G(1)>"
-"o:1:c(:c:c:2:c:1:c(:c(:c(:c:2-[#1])-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#6](~[#8])~[#8]","<regId=het_65_J(1)>"
-"[#6]#[#6]-[#6](=[#8])-[#6]#[#6]","<regId=ene_one_yne_A(1)>"
-"c:2(:c:1:c(:c(:c(:c(:c:1:c(:c(:c:2-[#8]-[#1])-[#6]=[#8])-[#1])-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=anil_OH_no_alk_B(1)>"
-"c:1(:c(:c(:[c;!H0,$(c-[#6;!H0;!H1])](:o:1))-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6;!H0,$([#6]-[#6;!H0!H1])]-c:2:c:c:c:c(:c:2)-[*]-[*]-[*]-c:3:c:c:c:o:3","<regId=hzone_acyl_misc_A(1)>"
-"[#16](=[#8])(=[#8])-[#7](-[#1])-c:1:c(:c(:c(:s:1)-[#6]-[#1])-[#6]-[#1])-[#6](=[#8])-[#7]-[#1]","<regId=thiophene_F(1)>"
-"[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#8]-[#1])-[#6](-[#1])-[#1]","<regId=anil_OC_alk_E(1)>"
-"[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-[#6](-[#1])(-[#6]=[#8])-[#16]","<regId=anil_OC_alk_F(1)>"
-"n1nnnc2cccc12","<regId=het_65_K(1)>"
-"c:1-,:2:c(-[#1]):s:c(:c:1-,:[#6](=[#8])-,:[#7]-,:[#7]=,:[#6]-,:2-[#7](-[#1])-[#1])-[#6]=[#8]","<regId=het_65_L(1)>"
-"c:1-,:3:c(:c:2:c(:c:c:1-[Br]):o:c:c:2)-,:[#6](=,:[#6]-,:[#6](=[#8])-,:[#8]-,:3)-[#1]","<regId=coumarin_E(1)>"
-"c:1-,:3:c(:c:c:c:c:1)-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-c:2:n:o:c:c:2-[Br])-,:[#6](=[#8])-,:[#8]-,:3)-[#1]","<regId=coumarin_F(1)>"
-"c:1-,:2:c(:c:c(:c:c:1-[F,Cl,Br,I])-[F,Cl,Br,I])-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-[#1])-,:[#6](=[#7]-[#1])-,:[#8]-,:2)-[#1]","<regId=coumarin_G(1)>"
-"c:1-,:3:c(:c:c:c:c:1)-,:[#6](=,:[#6](-[#6](=[#8])-[#7](-[#1])-c:2:n:c(:c:s:2)-[#6]:[#16]:[#6]-[#1])-,:[#6](=[#8])-,:[#8]-,:3)-[#1]","<regId=coumarin_H(1)>"
-"[#6](-[#1])(-[#1])-[#16;X2]-c:2:n:n:c:1-[#6]:[#6]-[#7]=[#6]-[#8]-c:1:n:2","<regId=het_thio_67_A(1)>"
-"[#16](=[#8])(=[#8])(-c:1:c:n(-[#6](-[#1])-[#1]):c:n:1)-[#7](-[#1])-c:2:c:n(:n:c:2)-[#6](-[#1])(-[#1])-[#6]:[#6]-[#8]-[#6](-[#1])-[#1]","<regId=sulfonamide_I(1)>"
-"c:1-2:c(:c(:c(:c(:c:1-[#8]-[#6](-[#1])(-[#1])-[#8]-2)-[#6](-[#1])(-[#1])-[#7]-3-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#6]:[#6]-3)-[#1])-[#1])-[#1]","<regId=het_65_mannich(1)>"
-"[#6](-[#1])(-[#1])-[#8]-[#6]:[#6]-[#6](-[#1])(-[#1])-[#7](-[#1])-c:2:c(:c(:c:1:n(:c(:n:c:1:c:2-[#1])-[#1])-[#6]-[#1])-[#1])-[#1]","<regId=anil_alk_A(1)>"
-"[#7]-4(-c:1:c:c:c:c:c:1)-[#6](=[#7+](-c:2:c:c:c:c:c:2)-[#6](=[#7]-c:3:c:c:c:c:c:3)-[#7]-4)-[#1]","<regId=het_5_inium(1)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:2:c:c:c:1:s:c(:n:c:1:c:2)-[#16]-[#6](-[#1])-[#1]","<regId=anil_di_alk_P(1)>"
-"c:1:2:c(:c(:c(:c(:c:1:c(:c(-[#1]):c(:c:2-[#1])-[#1])-[#6](-[#6](-[#1])-[#1])=[#7]-[#7](-[#1])-[#6](=[#16])-[#7](-[#1])-[#6]:[#6]:[#6])-[#1])-[#1])-[#1])-[#1]","<regId=thio_urea_Q(1)>"
-"[#6]:1(:[#7]:[#6](:[#7]:[!#1]:[#7]:1)-c:2:c(:c(:c(:o:2)-[#1])-[#1])-[#1])-[#16]-[#6;X4]","<regId=thio_pyridine_A(1)>"
-"n:1:c(:n:c(:n:c:1-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#7](-[#6]-[#1])-[#6]=[#8]","<regId=melamine_B(1)>"
-"c:1(:n:s:c(:n:1)-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7]-[#6](=[#8])-c:2:c:c:c:c:c:2-[#6](=[#8])-[#8]-[#1])-c:3:c:c:c:c:c:3","<regId=misc_phthal_thio_N(1)>"
-"n:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6](-[#1])-c:2:c:c:c:c:c:2-[#8]-[#6](-[#1])(-[#1])-[#6](=[#8])-[#8]-[#1]","<regId=hzone_acyl_misc_B(1)>"
-"[#6](-[#1])(-[#1])(-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-c:1:c(:c(:c(:c(:c:1-[#8]-[#1])-[#6](-[#6](-[#1])(-[#1])-[#1])(-[#6](-[#1])(-[#1])-[#1])-[#6](-[#1])(-[#1])-[#1])-[#1])-[#6](-[#1])(-[#1])-c:2:c:c:c(:c(:c:2-[#1])-[#1])-[#8]-[#1])-[#1]","<regId=tert_butyl_B(1)>"
-"[#7](-[#1])(-[#1])-c:1:c(-[#7](-[#1])-[#1]):c(:c(-[#1]):c:2:n:o:n:c:1:2)-[#1]","<regId=diazox_E(1)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#7](-[#1])-[#16](=[#8])=[#8])-[#1])-[#7](-[#1])-[#6](-[#1])-[#1])-[F,Cl,Br,I])-[#1]","<regId=anil_NH_no_alk_B(1)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#7]=[#6]-2-[#6](=[#6]~[#6]~[#6]=[#6]-2)-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=anil_no_alk_A(1)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-n:2:c:c:c:c:2)-[#1])-[#6](-[#1])-[#1])-[#6](-[#1])-[#1])-[#1]","<regId=anil_no_alk_B(1)>"
-"[#16]=[#6]-[#6](-[#6](-[#1])-[#1])=[#6](-[#6](-[#1])-[#1])-[#7](-[#6](-[#1])-[#1])-[#6](-[#1])-[#1]","<regId=thio_ene_amine_A(1)>"
-"[#6]-1:[#6]-[#8]-[#6]-2-[#6](-[#1])(-[#1])-[#6](=[#8])-[#8]-[#6]-1-2","<regId=het_55_B(1)>"
-"[#8]-[#6](=[#8])-[#6](-[#1])(-[#1])-[#16;X2]-[#6](=[#7]-[#6]#[#7])-[#7](-[#1])-c:1:c:c:c:c:c:1","<regId=cyanamide_A(1)>"
-"[#8]=[#6]-[#6]-1=[#6](-[#16]-[#6](=[#6](-[#1])-[#6])-[#16]-1)-[#6]=[#8]","<regId=ene_one_one_A(1)>"
-"[#8]=[#6]-1-[#7]-[#7]-[#6](=[#7]-[#6]-1=[#6]-[#1])-[!#1]:[!#1]","<regId=ene_six_het_D(1)>"
-"[#8]=[#6]-[#6](-[#1])=[#6](-[#6]#[#7])-[#6]","<regId=ene_cyano_E(1)>"
-"[#8](-[#1])-[#6](=[#8])-c:1:c(:c(:c(:c(:c:1-[#8]-[#1])-[#1])-c:2:c(-[#1]):c(:c(:o:2)-[#6](-[#1])=[#6](-[#6]#[#7])-c:3:n:c:c:n:3)-[#1])-[#1])-[#1]","<regId=ene_cyano_F(1)>"
-"c:1:c(:c:c:c:c:1)-[#7](-c:2:c:c:c:c:c:2)-[#7]=[#6](-[#1])-[#6]:3:[#6](:[#6](:[#6](:[!#1]:3)-c:4:c:c:c:c(:c:4)-[#6](=[#8])-[#8]-[#1])-[#1])-[#1]","<regId=hzone_furan_C(1)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-c:2:c(-[#1]):c(:c(-[#6](-[#1])-[#1]):o:2)-[#6]=[#8])-[#1])-[#1]","<regId=anil_no_alk_C(1)>"
-"[#8](-[#1])-[#6](=[#8])-c:1:c:c:c(:c:c:1)-[#7]-[#7]=[#6](-[#1])-[#6]:2:[#6](:[#6](:[#6](:[!#1]:2)-c:3:c:c:c:c:c:3)-[#1])-[#1]","<regId=hzone_acid_D(1)>"
-"[#8](-[#1])-[#6](=[#8])-c:1:c:c:c:c(:c:1)-[#6]:[!#1]:[#6]-[#6]=[#7]-[#7](-[#1])-[#6](=[#8])-[#6](-[#1])(-[#1])-[#8]","<regId=hzone_furan_E(1)>"
-"[#8](-[#1])-[#6]:1:[#6](:[#6]:[!#1]:[#6](:[#7]:1)-[#7](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6](=[#8])-[#8]","<regId=het_6_pyridone_NH2(1)>"
-"[#6]-1(=[!#6&!#1])-[#6](-[#7]=[#6]-[#16]-1)=[#8]","<regId=imine_one_fives_D(1)>"
-"n2(-c:1:c:c:c:c:c:1)c(c(-[#1])c(c2-[#6]=[#7]-[#8]-[#1])-[#1])-[#1]","<regId=pyrrole_M(1)>"
-"n2(-[#6](-[#1])-c:1:c(:c(:c:c(:c:1-[#1])-[#1])-[#1])-[#1])c(c(-[#1])c(c2-[#6]-[#1])-[#1])-[#6]-[#1]","<regId=pyrrole_N(1)>"
-"n1(-[#6](-[#1])-[#1])c(c(-[#6](=[#8])-[#6])c(c1-[#6]:[#6])-[#6])-[#6](-[#1])-[#1]","<regId=pyrrole_O(1)>"
-"n1(-[#6])c(c(-[#1])c(c1-[#6](-[#1])=[#6](-[#6]#[#7])-c:2:n:c:c:s:2)-[#1])-[#1]","<regId=ene_cyano_G(1)>"
-"n3(-c:1:c:c:c:c:c:1-[#7](-[#1])-[#16](=[#8])(=[#8])-c:2:c:c:c:s:2)c(c(-[#1])c(c3-[#1])-[#1])-[#1]","<regId=sulfonamide_J(1)>"
-"n2(-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#6](=[#8])-[#7](-[#1])-[#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#8]-[#6]:[#6])c(c(-[#1])c(c2-[#1])-[#1])-[#1]","<regId=misc_pyrrole_benz(1)>"
-"c:1(:c:c:c:c:c:1)-[#7](-[#1])-[#6](=[#16])-[#7]-[#7](-[#1])-[#6](-[#1])=[#6](-[#1])-[#6]=[#8]","<regId=thio_urea_R(1)>"
-"[#6]-1(-[#6](=[#8])-[#6](-[#1])(-[#1])-[#6]-[#6](-[#1])(-[#1])-[#6]-1=[#8])=[#6](-[#7]-[#1])-[#6]=[#8]","<regId=ene_one_one_B(1)>"
-"[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#16]-[#6;X4]-[#16]-1","<regId=dhp_amino_CN_H(1)>"
-"[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#1])-[#1])-[#7](-[#1])-c:2:c:c:n:c:3:c(:c:c:c(:c:2:3)-[#8]-[#6](-[#1])-[#1])-[#8]-[#6](-[#1])-[#1]","<regId=het_66_anisole(1)>"
-"[#6](-[#1])(-[#1])-[#8]-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#8]-[#6](-[#1])-[#1])-[#1])-[#7](-[#1])-c:2:n:c(:c:s:2)-c:3:c:c:c(:c:c:3)-[#8]-[#6](-[#1])-[#1]","<regId=thiazole_amine_N(1)>"
-"[#6]~1~3~[#7](-[#6]:[#6])~[#6]~[#6]~[#6]~[#6]~1~[#6]~2~[#7]~[#6]~[#6]~[#6]~[#7+]~2~[#7]~3","<regId=het_pyridiniums_C(1)>"
-"[#7]-3(-c:2:c:1:c:c:c:c:c:1:c:c:c:2)-[#7]=[#6](-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#6]-3=[#8]","<regId=het_5_E(1)>"
-"[#6]-1(=[#6;!H0,$([#6]-[#6;!H0;!H1]),$([#6]-[#6]=[#8])]-[#16]-[#6](-[#7;!H0,$([#7]-[#6;!H0]),$([#7]-[#6]:[#6])]-1)=[#7;!R])-[$([#6](-[#1])-[#1]),$([#6]:[#6])]","<regId=thiaz_ene_A(128)>"
-"n2(-[#6]:1:[!#1]:[#6]:[#6]:[#6]:[#6]:1)c(cc(c2-[#6;X4])-[#1])-[#6;X4]","<regId=pyrrole_A(118)>"
-"c:1:c:c(:c(:c:c:1)-[#8]-[#1])-[#8]-[#1]","<regId=catechol_A(92)>"
-"[#6]-1(=[#6])-[#6](-[#7]=[#6]-[#16]-1)=[#8]","<regId=ene_five_het_B(90)>"
-"[#6]-1=[!#1]-[!#6&!#1]-[#6](-[#6]-1=[!#6&!#1;!R])=[#8]","<regId=imine_one_fives(89)>"
-"[#6]-1(-[#6](-[#6]=[#6]-[!#6&!#1]-1)=[#6])=[!#6&!#1]","<regId=ene_five_het_C(85)>"
-"[#6]-[#7]-1-[#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])(-[#1])-[#6]-1(-[#1])-[#1])-[#7]=[#6](-[#1])-[#6]:[!#1]","<regId=hzone_pipzn(79)>"
-"c:1-2:c(:c:c:c:c:1)-[#6](=[#8])-[#6;X4]-[#6]-2=[#8]","<regId=keto_keto_beta_A(68)>"
-"n1(-[#6])c(c(-[#1])c(c1-[#6]=[#7]-[#7])-[#1])-[#1]","<regId=hzone_pyrrol(64)>"
-"[#6]=!@[#6](-[!#1])-@[#6](=!@[!#6&!#1])-@[#6](=!@[#6])-[!#1]","<regId=ene_one_ene_A(57)>"
-"[#6](-[#6]#[#7])(-[#6]#[#7])-[#6](-[#7](-[#1])-[#1])=[#6]-[#6]#[#7]","<regId=cyano_ene_amine_A(56)>"
-"c:1-2:c(:c:c:c:c:1)-[#6](=[#8])-[#6](=[#6])-[#6]-2=[#8]","<regId=ene_five_one_A(55)>"
-"[#6]-,:1(=,:[!#1]-,:[!#1]=,:[!#1]-,:[#7](-,:[#6]-,:1=[#16])-[#1])-[#6]#[#7]","<regId=cyano_pyridone_A(54)>"
-"c:1:c:c-2:c(:c:c:1)-[#6]-3-[#6](-[#6]-[#7]-2)-[#6]-[#6]=[#6]-3","<regId=anil_alk_ene(51)>"
-"c:1:c:2:c(:c:c:c:1):n:c:3:c(:c:2-[#7]):c:c:c:c:3","<regId=amino_acridine_A(46)>"
-"[#6]-1(=[#6])-[#6](=[#8])-[#7]-[#7]-[#6]-1=[#8]","<regId=ene_five_het_D(46)>"
-"[#7](-[#1])(-[#1])-c:1:c(:c(:c(:s:1)-[!#1])-[!#1])-[#6]=[#8]","<regId=thiophene_amino_Aa(45)>"
-"[#7]-[#6]=!@[#6]-2-[#6](=[#8])-c:1:c:c:c:c:c:1-[!#6&!#1]-2","<regId=ene_five_het_E(44)>"
-"c:1(:c(:c(:c(:c(:c:1-[#8]-[#1])-[F,Cl,Br,I])-[#1])-[F,Cl,Br,I])-[#1])-[#16](=[#8])(=[#8])-[#7]","<regId=sulfonamide_A(43)>"
-"[#6]-[#6](=[#16])-[#6]","<regId=thio_ketone(43)>"
-"c:1:c:c(:c:c:c:1-[#8]-[#1])-[#7](-[#1])-[#16](=[#8])=[#8]","<regId=sulfonamide_B(41)>"
-"c:1(:c(:c(:c(:c(:c:1-[#1])-[#1])-[$([#8]),$([#7]),$([#6](-[#1])-[#1])])-[#1])-[#1])-[#7](-[#1])-[#1]","<regId=anil_no_alk(40)>"
-"[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#6]:[#6])]:1:c(:c(:c(:s:1)-[#7](-[#1])-[#6](=[#8])-[#6])-[#6](=[#8])-[#8])-[$([#6]:1:[#6]:[#6]:[#6]:[#6]:[#6]:1),$([#6]:1:[#16]:[#6]:[#6]:[#6]:1)]","<regId=thiophene_amino_Ab(40)>"
-"[#7+]:1(:[#6]:[#6]:[!#1]:c:2:c:1:c(:[c;!H0,$(c-[#7])]:c:c:2)-[#1])-[$([#6](-[#1])(-[#1])-[#1]),$([#8;X1]),$([#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1]),$([#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#8]-[#1]),$([#6](-[#1])(-[#1])-[#6](=[#8])-[#6]),$([#6](-[#1])(-[#1])-[#6](=[#8])-[#7](-[#1])-[#6]:[#6]),$([#6](-[#1])(-[#1])-[#6](-[#1])(-[#1])-[#1])]","<regId=het_pyridiniums_A(39)>"
-"c:1:c:c:c:c(:c:1-[#7&!H0;!H1,!$([#7]-[#6]=[#8])])-[#6](-[#6]:[#6])=[#8]","<regId=anthranil_one_A(38)>"
-"[#7](-[#1])-[#7]=[#6](-[#6]#[#7])-[#6]=[!#6&!#1;!R]","<regId=cyano_imine_A(37)>"
-"[#7](-c:1:c:c:c:c:c:1)-[#16](=[#8])(=[#8])-[#6]:2:[#6]:[#6]:[#6]:[#6]:3:[#7]:[$([#8]),$([#16])]:[#7]:[#6]:2:3","<regId=diazox_sulfon_A(36)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(:c(:c(:c(:c:1-[#1])-[#1])-[#6](-[#1])=[#7]-[#7]-[$([#6](=[#8])-[#6](-[#1])(-[#1])-[#16]-[#6]:[#7]),$([#6](=[#8])-[#6](-[#1])(-[#1])-[!#1]:[!#1]:[#7]),$([#6](=[#8])-[#6]:[#6]-[#8]-[#1]),$([#6]:[#7]),$([#6](-[#1])(-[#1])-[#6](-[#1])-[#8]-[#1])])-[#1])-[#1]","<regId=hzone_anil_di_alk(35)>"
-"[#7]-1-[#6](=[#16])-[#16]-[#6;X4]-[#6]-1=[#8]","<regId=rhod_sat_A(33)>"
-"[#7](-[#1])-[#7]=[#6]-[#6;!H0,$([#6]-[#6])]=[#6](-[#6])-!@[$([#7]),$([#8]-[#1])]","<regId=hzone_enamin(30)>"
-"n2(-[#6]:1:[!#1]:[#6]:[#6]:[#6]:[#6]:1)c(cc(c2-[#6]:[#6])-[#1])-[#6;X4]","<regId=pyrrole_B(29)>"
-"s1ccc(c1)-[#8]-[#1]","<regId=thiophene_hydroxy(28)>"
-"[#6]-,:1(=,:[#6](-,:[#6](=[#8])-,:[#7]-,:[#6](=,:[#7]-,:1)-,:[!#6&!#1])-[#6]#[#7])-[#6]","<regId=cyano_pyridone_B(27)>"
-"[#6]-1(-[#6](=[#8])-[#7]-[#6](=[#8])-[#7]-[#6]-1=[#8])=[#7]","<regId=imine_one_sixes(27)>"
-"[#6](-[#1])(-[#1])-[#7]([#6]:[#6])~[#6][#6]=,:[#6]-[#6]~[#6][#7]","<regId=dyes5A(27)>"
-"c:2:c:1:c:c:c:c-,:3:c:1:c(:c:c:2)-,:[#7]-,:[#6]=,:[#7]-,:3","<regId=naphth_amino_A(25)>"
-"c:2:c:1:c:c:c:c-3:c:1:c(:c:c:2)-[#7](-[#6;X4]-[#7]-3-[#1])-[#1]","<regId=naphth_amino_B(25)>"
-"[#6]-[#6](=[#8])-[#6](-[#1])=[#6](-[#7](-[#1])-[#6])-[#6](=[#8])-[#8]-[#6]","<regId=ene_one_ester(24)>"
-"[#16]=[#6]-1-[#6]=,:[#6]-[!#6&!#1]-[#6]=,:[#6]-1","<regId=thio_dibenzo(23)>"
-"[#6](-[#6]#[#7])(-[#6]#[#7])-[#6](-[$([#6]#[#7]),$([#6]=[#7])])-[#6]#[#7]","<regId=cyano_cyano_A(23)>"
-"c:1:2:c(:c(:c(:c(:c:1:c(:c(:c(:c:2-[#1])-[#8]-[#1])-[#6](=[#8])-[#7](-[#1])-[#7]=[#6])-[#1])-[#1])-[#1])-[#1])-[#1]","<regId=hzone_acyl_naphthol(22)>"
-"[#8]=[#6]-c2c1nc(-[#6](-[#1])-[#1])cc(-[#8]-[#1])n1nc2","<regId=het_65_A(21)>"
-"n:1:c(:n(:c(:c:1-c:2:c:c:c:c:c:2)-c:3:c:c:c:c:c:3)-[#1])-[#6]:[!#1]","<regId=imidazole_A(19)>"
-"[#6](-[#6]#[#7])(-[#6]#[#7])=[#6]-c:1:c:c:c:c:c:1","<regId=ene_cyano_A(19)>"
-"c:1(:c:c:c:c:c:1-[#7](-[#1])-[#7]=[#6])-[#6](=[#8])-[#8]-[#1]","<regId=anthranil_acid_A(19)>"
-"[#7+]([#6]:[#6])=,:[#6]-[#6](-[#1])=[#6]-[#7](-[#6;X4])-[#6]","<regId=dyes3A(19)>"
-"[#7](-[#1])(-[#1])-[#6]-1=[#6](-[#6]#[#7])-[#6](-[#1])(-[#6]:[#6])-[#6](=[#6](-[#7](-[#1])-[#1])-[#16]-1)-[#6]#[#7]","<regId=dhp_bis_amino_CN(19)>"
-"[#7]~[#6]:1:[#7]:[#7]:[#6](:[$([#7]),$([#6]-[#1]),$([#6]-[#7]-[#1])]:[$([#7]),$([#6]-[#7])]:1)-[$([#7]-[#1]),$([#8]-[#6](-[#1])-[#1])]","<regId=het_6_tetrazine(18)>"
-"[#6]-[#6]=[#6](-[F,Cl,Br,I])-[#6](=[#8])-[#6]","<regId=ene_one_hal(17)>"
-"[#6](-[#6]#[#7])(-[#6]#[#7])=[#7]-[#7](-[#1])-c:1:c:c:c:c:c:1","<regId=cyano_imine_B(17)>"
-"[#6]-,:1(=,:[#6](-!@[#6](=[#8])-[#7]-[#6](-[#1])-[#1])-,:[#16]-,:[#6](-,:[#7]-,:1-,:[$([#6](-[#1])(-[#1])-[#6](-[#1])=[#6](-[#1])-[#1]),$([#6]:[#6])])=[#16])-,:[$([#7]-[#6](=[#8])-[#6]:[#6]),$([#7](-[#1])-[#1])]","<regId=thiaz_ene_B(17)>"
-"[#16]-1-[#6](=[#8])-[#7]-[#6](=[#8])-[#6]-1=[#6](-[#1])-[$([#6]-[#35]),$([#6]:[#6](-[#1]):[#6](-[F,Cl,Br,I]):[#6]:[#6]-[F,Cl,Br,I]),$([#6]:[#6](-[#1]):[#6](-[#1]):[#6]-[#16]-[#6](-[#1])-[#1]),$([#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]:[#6]-[#8]-[#6](-[#1])-[#1]),$([#6]:1:[#6](-[#6](-[#1])-[#1]):[#7](-[#6](-[#1])-[#1]):[#6](-[#6](-[#1])-[#1]):[#6]:1)]","<regId=ene_rhod_B(16)>"
-"[#8]-,:1-,:[#6](-,:[#16]-,:c:2:c-,:1:c:c:c(:c:2)-,:[$([#7]),$([#8])])=[$([#8]),$([#16])]","<regId=thio_carbonate_A(15)>"
-"[#7](-[#6](-[#1])-[#1])(-[#6](-[#1])-[#1])-c:1:c(:c(:c(:o:1)-[#6]=[#7]-[#7](-[#1])-[#6]=[!#6&!#1])-[#1])-[#1]","<regId=anil_di_alk_furan_A(15)>"
-"c:1(:c:c:c:c:c:1)-[#6](-[#1])=!@[#6]-3-[#6](=[#8])-c:2:c:c:c:c:c:2-[#16]-3","<regId=ene_five_het_F(15)>"
-"[#6]-1(-[#6](~[!#6&!#1]~[#6]-[!#6&!#1]-[#6]-1=[!#6&!#1])~[!#6&!#1])=[#6;!R]-[#1]","<regId=ene_six_het_A(483)>"
-"c:1:c:c(:c(:c:c:1)-[#6]=[#7]-[#7])-[#8]-[#1]","<regId=hzone_phenol_A(479)>"
-"[#6](-[#1])(-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c:c(:c(:[c;!H0,$(c-[#6](-[#1])-[#1]),$(c-[#8]-[#6](-[#1])(-[#1])-[#6](-[#1])-[#1])](:c:1))-[#7])-[#1]","<regId=anil_di_alk_A(478)>"
-"[n;!H0,$(n-[#6;!H0;!H1])]:1(c(c(c:2:c:1:c:c:c:c:2-[#1])-[#6;X4]-[#1])-[$([#6](-[#1])-[#1]),$([#6]=,:[!#6&!#1]),$([#6](-[#1])-[#7]),$([#6](-[#1])(-[#6](-[#1])-[#1])-[#6](-[#1])(-[#1])-[#7](-[#1])-[#6](-[#1])-[#1])])","<regId=indol_3yl_alk(461)>"
-"[!#6&!#1]=[#6]-1-[#6]=,:[#6]-[#6](=[!#6&!#1])-[#6]=,:[#6]-1","<regId=quinone_A(370)>"
-"[#7;!R]=[#7]","<regId=azo_A(324)>"
-"[#6]-[#6](=[!#6&!#1;!R])-[#6](=[!#6&!#1;!R])-[$([#6]),$([#16](=[#8])=[#8])]","<regId=imine_one_A(321)>"
-"[#7]-[#6;X4]-c:1:c:c:c:c:c:1-[#8]-[#1]","<regId=mannich_A(296)>"
-"c:1:c:c(:c:c:c:1-[#7](-[#6;X4])-[#6;X4])-[#6]=[#6]","<regId=anil_di_alk_B(251)>"
-"c:1:c:c(:c:c:c:1-[#8]-[#6;X4])-[#7;$([#7!H0]-[#6;X4]),$([#7](-[#6;X4])-[#6;X4])]","<regId=anil_di_alk_C(246)>"
-"[#7]-1-[#6](=[#16])-[#16]-[#6](=[#6])-[#6]-1=[#8]","<regId=ene_rhod_A(235)>"
-"c:1(:c:c:c(:c:c:1)-[#6]=[#7]-[#7])-[#8]-[#1]","<regId=hzone_phenol_B(215)>"
-"[#6]-1(=[#6])-[#6]=[#7]-[!#6&!#1]-[#6]-1=[#8]","<regId=ene_five_het_A(201)>"
-"c:1:c:c(:c:c:c:1-[#7](-[#6;X4])-[#6;X4])-[#6;X4]-[$([#8]-[#1]),$([#6]=[#6]-[#1]),$([#7]-[#6;X4])]","<regId=anil_di_alk_D(198)>"
-"[#8]=[#6]-2-[#6](=!@[#7]-[#7])-c:1:c:c:c:c:c:1-[#7]-2","<regId=imine_one_isatin(189)>"
-"[#6](-[#1])-[#7](-[#6](-[#1])-[#1])-c:1:c(:c(:c(:[c;!H0,$(c-[#6](-[#1])-[#1])](:c:1-[#1]))-[#6&!H0;!H1,$([#6]-[#6;!H0])])-[#1])-[#1]","<regId=anil_di_alk_E(186)>"

src/edm.py CHANGED Viewed

@@ -8,7 +8,6 @@ import torch.nn.functional as F
 from src import utils
 from src.egnn import Dynamics
 from src.noise import GammaNetwork, PredefinedNoiseSchedule
-from utils.logging_utils import get_logger
 class EDM(torch.nn.Module):
@@ -733,8 +732,6 @@ class InpaintingEDM(EDM):
 class RobustEDM(EDM):
     @torch.no_grad()
     def sample_chain(self, x, h, node_mask, fragment_mask, linker_mask, edge_mask, context, keep_frames=None):
-        logger = get_logger()
         n_samples = x.size(0)
         n_nodes = x.size(1)

 from src import utils
 from src.egnn import Dynamics
 from src.noise import GammaNetwork, PredefinedNoiseSchedule
 class EDM(torch.nn.Module):
 class RobustEDM(EDM):
     @torch.no_grad()
     def sample_chain(self, x, h, node_mask, fragment_mask, linker_mask, edge_mask, context, keep_frames=None):
         n_samples = x.size(0)
         n_nodes = x.size(1)

src/lightning.py CHANGED Viewed

@@ -2,7 +2,6 @@ import numpy as np
 import os
 import pytorch_lightning as pl
 import torch
-import wandb
 from src import metrics, utils, delinker
 from src.const import LINKER_SIZE_DIST
@@ -13,7 +12,6 @@ from src.datasets import (
 )
 from src.linker_size import DistributionNodes
 from src.molecule_builder import build_molecules
-from src.visualizer import save_xyz_files, visualize_chain
 from typing import Dict, List, Optional
 from tqdm import tqdm
@@ -308,22 +306,6 @@ class DDPM(pl.LightningModule):
                 self.log(f'{metric_name}/test', metric_value, prog_bar=True)
                 self.metrics.setdefault(f'{metric_name}/test', []).append(metric_value)
-    def generate_animation(self, chain_batch, node_mask, batch_i):
-        batch_indices, mol_indices = utils.get_batch_idx_for_animation(self.batch_size, batch_i)
-        for bi, mi in zip(batch_indices, mol_indices):
-            chain = chain_batch[:, bi, :, :]
-            name = f'mol_{mi}'
-            chain_output = os.path.join(self.samples_dir, f'epoch_{self.current_epoch}', name)
-            os.makedirs(chain_output, exist_ok=True)
-            one_hot = chain[:, :, 3:-1] if self.include_charges else chain[:, :, 3:]
-            positions = chain[:, :, :3]
-            chain_node_mask = torch.cat([node_mask[bi].unsqueeze(0) for _ in range(self.FRAMES)], dim=0)
-            names = [f'{name}_{j}' for j in range(self.FRAMES)]
-            save_xyz_files(chain_output, one_hot, positions, chain_node_mask, names=names, is_geom=self.is_geom)
-            visualize_chain(chain_output, wandb=wandb, mode=name, is_geom=self.is_geom)
     def sample_and_analyze(self, dataloader):
         pred_molecules = []
         true_molecules = []

 import os
 import pytorch_lightning as pl
 import torch
 from src import metrics, utils, delinker
 from src.const import LINKER_SIZE_DIST
 )
 from src.linker_size import DistributionNodes
 from src.molecule_builder import build_molecules
 from typing import Dict, List, Optional
 from tqdm import tqdm
                 self.log(f'{metric_name}/test', metric_value, prog_bar=True)
                 self.metrics.setdefault(f'{metric_name}/test', []).append(metric_value)
     def sample_and_analyze(self, dataloader):
         pred_molecules = []
         true_molecules = []