import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
#from pretokenizer import atomwise_tokenizer

# Load the tokenizer saved in the current directory.
tokenizer = AutoTokenizer.from_pretrained("./")

# Read the raw SMILES dataset and work on a copy so the source frame stays untouched.
dataset = pd.read_csv('./chemT5_data.csv')
train = dataset.copy()

# Tokenize every SMILES string and write the token sequence back into the frame.
for i, line in tqdm(enumerate(dataset['SMILES']), total=len(dataset)):
    ids = tokenizer.encode(line)
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # .at assigns directly to the cell; the original chained indexing
    # (train.iloc[i]['SMILES'] = ...) writes to a temporary copy and can
    # leave the frame unchanged.
    train.at[i, 'SMILES'] = tokens

train.to_csv('pretrain.csv', index=False)
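
# --- Optional sanity check (illustrative sketch, not part of the original script) ---
# Assuming pretrain.csv was written as above, reload it and inspect the first few
# tokenized SMILES entries. Note that token lists are serialized as their string
# repr in the CSV, so they come back as strings rather than Python lists.
check = pd.read_csv('pretrain.csv')
print(check['SMILES'].head())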