# diffdock/datasets/esm_embedding_preparation.py
# amit-scans's picture
# Duplicate from simonduerr/diffdock
# fba25b8
import os
from argparse import FileType, ArgumentParser
import numpy as np
import pandas as pd
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm
from Bio import SeqIO
# Three-letter residue names mapped to the one-letter codes written to FASTA.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    # MSE (selenomethionine) is almost the same amino acid as MET; the sulfur
    # is just replaced by selenium, so it is written as "M".
    "MSE": "M",
    "PHE": "F",
    "PRO": "P",
    "PYL": "O",
    "SER": "S",
    "SEC": "U",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "ASX": "B",
    "GLX": "Z",
    "XAA": "X",
    "XLE": "J",
}

# A residue is only treated as an amino acid when all of these atoms exist.
_BACKBONE_ATOM_NAMES = frozenset({"CA", "N", "C"})


def _chain_to_sequence(chain, file_path):
    """Return the one-letter sequence of the amino-acid residues in *chain*.

    Residues named "HOH" and residues lacking any of the CA, N or C atoms are
    skipped. A residue that has the backbone atoms but whose name is not in
    ``_THREE_TO_ONE`` contributes a dash and a warning is printed;
    *file_path* is only used in that warning message.
    """
    letters = []
    for residue in chain:
        resname = residue.get_resname()
        if resname == "HOH":
            continue
        # Only the presence of the backbone atoms matters here, so the atom
        # names are collected and the coordinates are never read.
        atom_names = {atom.name for atom in residue}
        if not _BACKBONE_ATOM_NAMES <= atom_names:
            continue
        letter = _THREE_TO_ONE.get(resname)
        if letter is None:
            letter = "-"
            print(
                "encountered unknown AA: ",
                resname,
                " in the complex ",
                file_path,
                ". Replacing it with a dash - .",
            )
        letters.append(letter)
    return "".join(letters)


def esm_embedding_prep(out_file, protein_path):
    """Write the per-chain amino-acid sequences of a structure file as FASTA.

    The file at *protein_path* is parsed with ``PDBParser`` and only its
    first model is used. One record is produced per chain, in iteration
    order, with the ID ``"<basename of protein_path>_chain_<index>"`` and an
    empty description. Chains that contain no recognised residue still get a
    record with an empty sequence.

    Args:
        out_file: Destination handed unchanged to ``SeqIO.write``.
        protein_path: Path of the structure file handed to
            ``PDBParser.get_structure``.

    Returns:
        None. The result is the FASTA data written to *out_file*.
    """
    biopython_parser = PDBParser()

    file_paths = [protein_path]
    records = []
    for file_path in tqdm(file_paths):
        structure = biopython_parser.get_structure("random_id", file_path)
        first_model = structure[0]
        for i, chain in enumerate(first_model):
            record = SeqRecord(
                Seq(_chain_to_sequence(chain, file_path)),
                f"{os.path.basename(file_path)}_chain_{i}",
            )
            # Clear the description so the record carries only its ID.
            record.description = ""
            records.append(record)
    SeqIO.write(records, out_file, "fasta")