# -*- coding: utf-8 -*- """ Created on Mon May 1 19:41:07 2023 @author: Sen """ import os import subprocess import warnings from tqdm import tqdm import argparse import torch from transformers import AutoTokenizer, GPT2LMHeadModel warnings.filterwarnings('ignore') os.environ["http_proxy"] = "http://127.0.0.1:7890" os.environ["https_proxy"] = "http://127.0.0.1:7890" # Set up command line argument parsing parser = argparse.ArgumentParser() parser.add_argument('-p', type=str, default=None, help='Input the protein amino acid sequence. Default value is None. Only one of -p and -f should be specified.') parser.add_argument('-f', type=str, default=None, help='Input the FASTA file. Default value is None. Only one of -p and -f should be specified.') parser.add_argument('-l', type=str, default='', help='Input the ligand prompt. Default value is an empty string.') parser.add_argument('-n', type=int, default=100, help='Number of output molecules to generate. Default value is 100.') parser.add_argument('-d', type=str, default='cuda', help="Hardware device to use. Default value is 'cuda'.") parser.add_argument('-o', type=str, default='./ligand_output/', help="Output directory for generated molecules. Default value is './ligand_output/'.") args = parser.parse_args() protein_seq = args.p fasta_file = args.f ligand_prompt = args.l num_generated = args.n device = args.d output_path = args.o def ifno_mkdirs(dirname): if not os.path.exists(dirname): os.makedirs(dirname) ifno_mkdirs(output_path) # Function to read in FASTA file def read_fasta_file(file_path): with open(file_path, 'r') as fasta_file: sequence = [] for line in fasta_file: line = line.strip() if not line.startswith('>'): sequence.append(line) protein_sequence = ''.join(sequence) return protein_sequence # Check if the input is either a protein amino acid sequence or a FASTA file, but not both if (protein_seq is not None) != (fasta_file is not None): if fasta_file is not None: protein_seq = read_fasta_file(fasta_file) else: protein_seq = protein_seq else: print("The input should be either a protein amino acid sequence or a FASTA file, but not both.") # Load the tokenizer and the model tokenizer = AutoTokenizer.from_pretrained('liyuesen/druggpt') model = GPT2LMHeadModel.from_pretrained("liyuesen/druggpt") # Generate a prompt for the model p_prompt = "<|startoftext|>
" + protein_seq + "