# from rdkit import Chem
import tensorflow as tf
import torch
# from t5_tokenizer_model import SentencePieceUnigramTokenizer
from pretokenizer import atomwise_tokenizer
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config
from tokenizers import Tokenizer
import numpy as np

# Load the tokenizer from the local checkpoint directory.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="./", from_flax=True)
tokenizer = AutoTokenizer.from_pretrained("./")
# tokenizer = Tokenizer.from_file("/home/zoez/chemT5")
# model = model.to(device)
# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)

# # Encode the context the generation is conditioned on.
# input_ids1 = tokenizer.encode("1", return_tensors='pt')
# print(input_ids1)

# # Activate beam search and early stopping.
# beam_output1 = model.generate(
#     input_ids1,
#     max_length=50,
#     num_beams=5,
#     early_stopping=True
# )

# Tokenize an example SMILES string and print the resulting tokens.
encoding = tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1")
print(tokenizer.convert_ids_to_tokens(encoding))
# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)

# # Set a seed to reproduce results; change it to get different samples.
# tf.random.set_seed(0)
# # Use temperature to decrease the sensitivity to low-probability candidates.
# sample_output = model.generate(
#     input_ids1,
#     do_sample=True,
#     max_length=50,
#     top_k=0,
#     temperature=0.7
# )
# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

# print("Output: 1\n" + 100 * '-')
# print(tokenizer.decode(beam_output1[0], skip_special_tokens=True))
# print(tokenizer.convert_ids_to_tokens(beam_output1[0]))

# # Encode the context the generation is conditioned on.
# input_ids2 = tokenizer.encode(": ", return_tensors='pt')
# # Activate beam search and early stopping; return all nine beams.
# beam_output2 = model.generate(
#     input_ids2,
#     max_length=50,
#     num_beams=9,
#     no_repeat_ngram_size=2,
#     num_return_sequences=9,
#     early_stopping=True
# )
# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1"))
# print("Output: 2\n" + 100 * '-')
# print(tokenizer.decode(beam_output2[0], skip_special_tokens=True))

# # start = latent_to_string(latent0)
# # destination = latent_to_string(latent1)
# mols1 = []
# step = np.linspace(0, 1, 100)
# invalid = 0
# steps = []
# step_invalid = []
# # Generate molecules using interpolation: decode each beam, keep unique
# # valid SMILES, and count decodes that RDKit rejects.
# for i, beam in enumerate(beam_output2):
#     # target_latent = (1.0 - step[i]) * latent0 + step[i] * latent1
#     # string = latent_to_string(target_latent)
#     smiles = tokenizer.decode(beam, skip_special_tokens=True)  # when using SMILES
#     print(smiles)
#     # smiles = sel.decoder(string)  # when using SELFIES
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         if smiles not in mols1:
#             mols1.append(smiles)
#             steps.append(i)
#     else:
#         invalid = invalid + 1
#         step_invalid.append(i)
# # print("starting mol:", start)
# # print("destination mol:", destination)
# print("generated mols:", mols1)
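
# ---------------------------------------------------------------------------
# Hedged sketch: one way the commented-out pipeline above could look end to
# end. This is only an illustration, assuming a local T5 checkpoint lives in
# "./" (the same directory the tokenizer is loaded from) and that RDKit is
# installed; the function is defined but not called, so the script's current
# behaviour (tokenize one SMILES and print its tokens) is unchanged.
# ---------------------------------------------------------------------------
def generate_and_validate(prompt=": ", num_beams=9, max_length=50):
    from rdkit import Chem  # assumption: RDKit is available

    # Load the (Flax-trained) model and encode the conditioning prompt.
    model = T5ForConditionalGeneration.from_pretrained("./", from_flax=True)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Beam search, returning every beam so each candidate can be checked.
    beams = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )

    # Keep unique SMILES that RDKit can parse; count the rest as invalid.
    valid, invalid = [], 0
    for beam in beams:
        smiles = tokenizer.decode(beam, skip_special_tokens=True)
        if Chem.MolFromSmiles(smiles):
            if smiles not in valid:
                valid.append(smiles)
        else:
            invalid += 1
    print("generated mols:", valid, "| invalid decodes:", invalid)
    return valid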