# chemT5 / pretrain_data.py
# Author: ZoeMC — "Saving weights and logs of step 10000" (commit 0ab88d6)
import tensorflow as tf
import torch as pt
import pandas as pd
import re
from t5_tokenizer_model import SentencePieceUnigramTokenizer
#from pretokenizer import atomwise_tokenizer
from tqdm import tqdm
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, T5Config
from tokenizers import Tokenizer
import numpy as np
# Tokenize every SMILES string in the pretraining corpus with the local
# tokenizer and save the resulting token sequences to pretrain.csv.
tokenizer = AutoTokenizer.from_pretrained("./")
dataset = pd.read_csv('./chemT5_data.csv')

# Take an independent copy: pd.DataFrame(data=dataset) shares the underlying
# blocks with `dataset`, so in-place writes could mutate the source frame.
train = dataset.copy()

for i, line in tqdm(enumerate(dataset['SMILES'])):
    # Encode to ids, then map back to token strings so the saved corpus is
    # human-readable rather than tokenizer-version-dependent ids.
    ids = tokenizer.encode(line)
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # .at does a direct label-based scalar write. The original
    # train.iloc[i]['SMILES'] = ... is chained assignment: it writes into a
    # temporary row copy and can silently leave `train` unchanged.
    # NOTE(review): assumes the default RangeIndex from read_csv, so the
    # positional i is also the label — confirm if the CSV gains an index column.
    train.at[i, 'SMILES'] = tokens

# Write once, after the whole column has been rewritten (writing inside the
# loop would re-serialize the full frame on every row: O(n^2) I/O).
train.to_csv('pretrain.csv', index=False)