Spaces:

simonduerr
/

diffdock

Sleeping

diffdock / datasets /esm_embedding_preparation.py

Simon Duerr

gradio update

486fd8a about 2 years ago

2.74 kB

	import os
	from argparse import FileType, ArgumentParser

	import numpy as np
	import pandas as pd
	from Bio.PDB import PDBParser
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from tqdm import tqdm
	from Bio import SeqIO



	# Three-letter residue names mapped to the one-letter codes written to FASTA.
	_THREE_TO_ONE = {
	    "ALA": "A",
	    "ARG": "R",
	    "ASN": "N",
	    "ASP": "D",
	    "CYS": "C",
	    "GLN": "Q",
	    "GLU": "E",
	    "GLY": "G",
	    "HIS": "H",
	    "ILE": "I",
	    "LEU": "L",
	    "LYS": "K",
	    "MET": "M",
	    # MSE is almost the same amino acid as MET; the sulfur is just
	    # replaced by selenium.
	    "MSE": "M",
	    "PHE": "F",
	    "PRO": "P",
	    "PYL": "O",
	    "SER": "S",
	    "SEC": "U",
	    "THR": "T",
	    "TRP": "W",
	    "TYR": "Y",
	    "VAL": "V",
	    "ASX": "B",
	    "GLX": "Z",
	    "XAA": "X",
	    "XLE": "J",
	}

	# A residue is only treated as an amino acid when it carries all of these
	# atom names.
	_BACKBONE_ATOMS = frozenset({"CA", "N", "C"})


	def _chain_to_sequence(chain, file_path):
	    """Return the one-letter sequence of ``chain`` as a string.

	    Residues named ``HOH`` and residues lacking any of the ``CA``, ``N`` or
	    ``C`` atoms are skipped. A residue whose name is not in
	    ``_THREE_TO_ONE`` contributes ``"-"`` and a message is printed;
	    ``file_path`` is used only in that message.
	    """
	    letters = []
	    for residue in chain:
	        resname = residue.get_resname()
	        if resname == "HOH":
	            continue
	        # Only the presence of the backbone atoms matters here, so their
	        # coordinates are never read.
	        atom_names = {atom.name for atom in residue}
	        if not _BACKBONE_ATOMS <= atom_names:
	            continue
	        try:
	            letters.append(_THREE_TO_ONE[resname])
	        except KeyError:
	            letters.append("-")
	            print(
	                "encountered unknown AA: ",
	                resname,
	                " in the complex ",
	                file_path,
	                ". Replacing it with a dash - .",
	            )
	    return "".join(letters)


	def esm_embedding_prep(out_file, protein_path):
	    """Write the per-chain sequences of a structure file as FASTA records.

	    The file at ``protein_path`` is parsed with ``PDBParser`` and only the
	    model at index 0 is used. Each chain of that model becomes one record
	    whose id is ``"<basename of protein_path>_chain_<i>"``, where ``i`` is
	    the chain's position in the model, and whose description is empty.
	    A chain with no recognised residues still yields a record with an
	    empty sequence.

	    Args:
	        out_file: Output target, passed unchanged to ``SeqIO.write``.
	        protein_path: Path of the structure file, passed to
	            ``PDBParser.get_structure``.

	    Returns:
	        None. The records are written to ``out_file``.
	    """
	    biopython_parser = PDBParser()

	    file_paths = [protein_path]
	    sequences = []
	    ids = []
	    for file_path in tqdm(file_paths):
	        structure = biopython_parser.get_structure("random_id", file_path)
	        model = structure[0]
	        for i, chain in enumerate(model):
	            sequences.append(_chain_to_sequence(chain, file_path))
	            ids.append(f"{os.path.basename(file_path)}_chain_{i}")

	    records = []
	    for record_id, seq in zip(ids, sequences):
	        record = SeqRecord(Seq(seq), str(record_id))
	        # Clear the description so the FASTA header holds only the id.
	        record.description = ""
	        records.append(record)
	    SeqIO.write(records, out_file, "fasta")