"""Tokenize SMILES strings with the chemistry T5 tokenizer saved in this directory.

The generation experiments below (beam search, sampling, latent interpolation)
are kept commented out.
"""
#from rdkit import Chem
import tensorflow as tf
import torch
#from t5_tokenizer_model import SentencePieceUnigramTokenizer
from pretokenizer import atomwise_tokenizer
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config
from tokenizers import Tokenizer
import numpy as np


#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#model = T5ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path="./", from_flax=True)
# Load the tokenizer trained and saved in the current directory.
tokenizer = AutoTokenizer.from_pretrained("./")
#tokenizer = Tokenizer.from_file("/home/zoez/chemT5")
#model = model.to(device)

#print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
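
# Sketch (assumption: pretokenizer.atomwise_tokenizer follows the SmilesPE
# API and splits a SMILES string into atom-level tokens). Useful to compare
# the rule-based atomwise split against the learned tokenizer's output:
# print(atomwise_tokenizer("O=[N+]([O-])c1ccc(Cl)cc1"))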


# # # encode context the generation is conditioned on
# input_ids1 = tokenizer.encode("1",return_tensors='pt')
# print(input_ids1)

# # # activate beam search and early_stopping
# beam_output1 = model.generate(
#     input_ids1, 
#     max_length=50, 
#     num_beams=5, 
#     early_stopping=True
# )
# Tokenize a sample SMILES string and print its token-level representation.
encoding = tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1")
print(tokenizer.convert_ids_to_tokens(encoding))
# #print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)
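
# Sanity check (added sketch): decode the ids back to a string. For a
# lossless tokenizer this should reproduce the input SMILES, minus any
# special tokens appended by encode().
print(tokenizer.decode(encoding, skip_special_tokens=True))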

# # set seed to reproduce results. Feel free to change the seed though to get different results
# tf.random.set_seed(0)

# # use temperature to decrease the sensitivity to low probability candidates
# sample_output = model.generate(
#     input_ids1, 
#     do_sample=True, 
#     max_length=50, 
#     top_k=0, 
#     temperature=0.7
# )

# print("Output:\n" + 100 * '-')
# print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

# print("Output: 1\n" + 100 * '-')
# print(tokenizer.decode(beam_output1[0], skip_special_tokens=True))
# # convert_ids_to_tokens expects token ids, not a decoded string,
# # so pass the generated ids directly:
# print(tokenizer.convert_ids_to_tokens(beam_output1[0]))

# # encode context the generation is conditioned on
# input_ids2 = tokenizer.encode(": ",return_tensors='pt')

# # activate beam search and early_stopping
# beam_output2 = model.generate(
#     input_ids2, 
#     max_length=50, 
#     num_beams=9, 
#     no_repeat_ngram_size=2, 
#     num_return_sequences=9, 
#     early_stopping=True
# )
# print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1"))
# print("Output: 2\n" + 100 * '-')
# print(tokenizer.decode(beam_output2[0], skip_special_tokens=True))

# # #start = latent_to_string(latent0)
# # #destination = latent_to_string(latent1)
# mols1 = []
# step = np.linspace(0,1,100)
# invalid = 0
# steps = []
# step_invalid = []
# # Generate molecules using interpolation
# for i, beam in enumerate(beam_output2):
#     #target_latent = (1.0-step[i])*latent0 + step[i]*latent1
#     #string  = latent_to_string(target_latent)
#     smiles = tokenizer.decode(beam, skip_special_tokens=True)    # when using SMILES
#     print(tokenizer.decode(beam, skip_special_tokens=True))
#     #smiles = sel.decoder(string)  # when using SELFIES
#     mol = Chem.MolFromSmiles(smiles)
#     if mol:
#         if smiles not in mols1:
#             mols1.append(smiles)
#             steps.append(i)
#     else:
#         invalid += 1
#         step_invalid.append(i)
# #print("starting mol:", start)
# #print('destination mol:', destination)
# print("generated mols:", mols1)