# chemT5/try.py
#from rdkit import Chem  # needed only by the commented-out validity check below
import torch  # the commented-out generation code uses PyTorch tensors (return_tensors='pt')
#from t5_tokenizer_model import SentencePieceUnigramTokenizer
from pretokenizer import atomwise_tokenizer
from transformers import AutoTokenizer, T5ForConditionalGeneration
from tokenizers import Tokenizer
import numpy as np
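# Hedged sketch (added; not in the original script): atomwise_tokenizer is
# assumed to behave like SmilesPE's function of the same name, splitting a
# SMILES string into atom-level tokens.
#print(atomwise_tokenizer("O=[N+]([O-])c1ccc(Cl)cc1"))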
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="./", from_flax=True)
# Load the tokenizer saved in the current checkpoint directory
tokenizer = AutoTokenizer.from_pretrained("./")
#tokenizer = Tokenizer.from_file("/home/zoez/chemT5")
#model = model.to(device)
#print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
# # encode context the generation is conditioned on
# input_ids1 = tokenizer.encode("1", return_tensors='pt')
# print(input_ids1)
# # # activate beam search and early_stopping
# beam_output1 = model.generate(
# input_ids1,
# max_length=50,
# num_beams=5,
# early_stopping=True
# )
# Tokenize an example SMILES string (4-chloronitrobenzene) and inspect the tokens
encoding = tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1")
print(tokenizer.convert_ids_to_tokens(encoding))
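# Minimal round-trip check (added sketch, standard transformers API): decoding
# the ids should reproduce the input SMILES, modulo any whitespace the
# tokenizer inserts between tokens.
print(tokenizer.decode(encoding, skip_special_tokens=True))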
# # set seed to reproduce results. Feel free to change the seed though to get different results
# torch.manual_seed(0)  # the model runs in PyTorch, so seed torch rather than tf
# # use temperature to decrease the sensitivity to low probability candidates
# sample_output = model.generate(
# input_ids1,
# do_sample=True,
# max_length=50,
# top_k=0,
# temperature=0.7
# )
# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
# print("Output: 1\n" + 100 * '-')
# print(tokenizer.decode(beam_output1[0], skip_special_tokens=True))
# decoding=tokenizer.decode(beam_output1[0], skip_special_tokens=True)
# print(tokenizer.convert_ids_to_tokens(beam_output1[0]))  # convert_ids_to_tokens expects ids, not the decoded string
# # encode context the generation is conditioned on
# input_ids2 = tokenizer.encode(": ", return_tensors='pt')
# # activate beam search and early_stopping
# beam_output2 = model.generate(
# input_ids2,
# max_length=50,
# num_beams=9,
# no_repeat_ngram_size=2,
# num_return_sequences=9,
# early_stopping=True
# )
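# # Note (added): with num_return_sequences=9, beam_output2 holds nine beams;
# # the loop further down decodes each one.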
# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1"))
# print("Output: 2\n" + 100 * '-')
# print(tokenizer.decode(beam_output2[0], skip_special_tokens=True))
# # #start = latent_to_string(latent0)
# # #destination = latent_to_string(latent1)
# mols1 = []
# step = np.linspace(0,1,100)
# invalid = 0
# steps = []
# step_invalid = []
# # Generate molecules using interpolation
# for i, beam in enumerate(beam_output2):
#     #target_latent = (1.0-step[i])*latent0 + step[i]*latent1
#     #string = latent_to_string(target_latent)
#     smiles = tokenizer.decode(beam, skip_special_tokens=True) # when using SMILES
#     print(smiles)
#     #smiles = sel.decoder(string) # when using SELFIES
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         if smiles not in mols1:
#             mols1.append(smiles)
#             steps.append(i)
#     else:
#         invalid = invalid + 1
#         step_invalid.append(i)
# #print("starting mol:", start)
# #print('destination mol:', destination)
# print("generated mols:", mols1)