"""Pre-tokenize the SMILES column of chemT5_data.csv for T5 pre-training.

Each SMILES string is encoded to token ids with the tokenizer stored in the
current directory, converted back to token strings, and the resulting token
list replaces the original SMILES value. Output is written to pretrain.csv.
"""
# NOTE(review): tensorflow, torch, re, numpy, and the extra tokenizer/T5
# imports below are unused in this script — kept in case another entry
# point relies on their import-time side effects; confirm and prune.
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import torch as pt
from tokenizers import Tokenizer
from tqdm import tqdm
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config

from t5_tokenizer_model import SentencePieceUnigramTokenizer
#from pretokenizer import atomwise_tokenizer

# Tokenizer files (vocab/config) are expected in the working directory.
tokenizer = AutoTokenizer.from_pretrained("./")

dataset = pd.read_csv('./chemT5_data.csv')
# Explicit copy: pd.DataFrame(data=dataset) may share memory with `dataset`
# (pre-copy-on-write pandas), so writes below could alias the source frame.
train = dataset.copy()

for i, line in tqdm(enumerate(dataset['SMILES']), total=len(dataset)):
    # encode() includes the tokenizer's special tokens (e.g. </s>);
    # round-tripping through ids mirrors the original script's behavior.
    ids = tokenizer.encode(line)
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # .at performs a single-label write. The original chained assignment
    # (train.iloc[i]['SMILES'] = ...) writes into a temporary copy and
    # silently leaves `train` unchanged under pandas copy-on-write.
    train.at[i, 'SMILES'] = tokens

train.to_csv('pretrain.csv', index=False)