#from rdkit import Chem
import tensorflow as tf
import torch
#from t5_tokenizer_model import SentencePieceUnigramTokenizer
from pretokenizer import atomwise_tokenizer
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config
from tokenizers import Tokenizer
import numpy as np
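# Sketch of the expected pretokenizer interface (assumption: the local
# atomwise_tokenizer follows the SmilesPE convention of taking a SMILES
# string and returning a list of atom-level tokens). Uncomment to verify:
# print(atomwise_tokenizer("O=[N+]([O-])c1ccc(Cl)cc1"))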
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="./", from_flax=True)
tokenizer = AutoTokenizer.from_pretrained("./")
#tokenizer = Tokenizer.from_file("/home/zoez/chemT5")
#model = model.to(device)
#print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
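# Sanity check (sketch): report basic properties of the loaded tokenizer
# using standard transformers attributes.
print("vocab size:", tokenizer.vocab_size)
print("special tokens:", tokenizer.all_special_tokens)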
# # encode context the generation is conditioned on
# input_ids1 = tokenizer.encode("1", return_tensors='pt')
# print(input_ids1)
# # activate beam search and early_stopping
# beam_output1 = model.generate(
#     input_ids1,
#     max_length=50,
#     num_beams=5,
#     early_stopping=True
# )
encoding = tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1")
print(tokenizer.convert_ids_to_tokens(encoding))
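# Round-trip check (sketch): decoding the ids should reproduce the input
# SMILES (modulo special tokens) if the tokenizer is lossless.
print(tokenizer.decode(encoding, skip_special_tokens=True))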
# #print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
# # set seed to reproduce results; change the seed to get different samples
# tf.random.set_seed(0)
# # use temperature to decrease the sensitivity to low-probability candidates
# sample_output = model.generate(
#     input_ids1,
#     do_sample=True,
#     max_length=50,
#     top_k=0,
#     temperature=0.7
# )
# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# print("Output: 1\n" + 100 * '-')
# print(tokenizer.decode(beam_output1[0], skip_special_tokens=True))
# decoding = tokenizer.decode(beam_output1[0], skip_special_tokens=True)
# # convert_ids_to_tokens expects token ids, not the decoded string
# print(tokenizer.convert_ids_to_tokens(beam_output1[0]))
# # encode context the generation is conditioned on
# input_ids2 = tokenizer.encode(": ", return_tensors='pt')
# # activate beam search and early_stopping
# beam_output2 = model.generate(
#     input_ids2,
#     max_length=50,
#     num_beams=9,
#     no_repeat_ngram_size=2,
#     num_return_sequences=9,
#     early_stopping=True
# )
# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1"))
# print("Output: 2\n" + 100 * '-')
# print(tokenizer.decode(beam_output2[0], skip_special_tokens=True))
# #start = latent_to_string(latent0)
# #destination = latent_to_string(latent1)
# mols1 = []
# step = np.linspace(0, 1, 100)
# invalid = 0
# steps = []
# step_invalid = []
# # Generate molecules using interpolation
# for i, beam in enumerate(beam_output2):
#     #target_latent = (1.0 - step[i]) * latent0 + step[i] * latent1
#     #string = latent_to_string(target_latent)
#     smiles = tokenizer.decode(beam, skip_special_tokens=True)  # when using SMILES
#     print(smiles)
#     #smiles = sel.decoder(string)  # when using SELFIES
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         if smiles not in mols1:
#             mols1.append(smiles)
#             steps.append(i)
#     else:
#         invalid = invalid + 1
#         step_invalid.append(i)
# #print("starting mol:", start)
# #print('destination mol:', destination)
# print("generated mols:", mols1)