import pandas as pd
from t5_tokenizer_model import SentencePieceUnigramTokenizer
#from pretokenizer import atomwise_tokenizer
from tqdm import tqdm

vocab_size = 32_000
input_sentence_size = None

# Load the SMILES dataset.
#dataset = load_dataset('csv', data_files='/home/zoez/Chem-T5/train-file.csv', split="train")
dataset = pd.read_csv('./chemT5_data.csv')  #('/home/zoez/Chem-T5/train-file.csv')

# Unigram tokenizer with T5-style special tokens.
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")

# Keep only the SMILES column and replace missing entries with empty strings.
dataset = dataset[['SMILES']].fillna('')

# Optional atom-wise pre-tokenization (disabled: the raw SMILES strings are
# fed to the Unigram trainer as-is).
#for i, line in enumerate(tqdm(dataset['SMILES'])):
#    dataset.at[i, 'SMILES'] = atomwise_tokenizer(line)

# Build an iterator over this dataset, yielding batches of 100 SMILES strings.
def batch_iterator(input_sentence_size=None):
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset['SMILES'][i : i + batch_length]

# Train the tokenizer.
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Save the trained tokenizer to disk.
tokenizer.save("./uni-tokenizer.json")

# Sanity check: tokenize a sample SMILES string (1-chloro-4-nitrobenzene).
print(tokenizer.encode("O=[N+]([O-])c1ccc(Cl)cc1").tokens)

# Replace each SMILES string with its token sequence. Note: .at is used here
# because the original chained assignment (dataset.iloc[i]['SMILES'] = ...)
# writes to a temporary copy and silently leaves the frame unchanged.
for i, line in enumerate(tqdm(dataset['SMILES'])):
    dataset.at[i, 'SMILES'] = tokenizer.encode(line).tokens

#from transformers import T5Config
#config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.get_vocab_size())
#config.save_pretrained("./")
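# A possible follow-up, sketched as an assumption rather than taken from the
# original script: wrap the saved Unigram tokenizer in a transformers
# PreTrainedTokenizerFast and pair it with a T5 config, as the commented-out
# T5Config lines above suggest. The checkpoint name "google/t5-v1_1-base"
# comes from those comments; the output directory "./chem-t5-tokenizer" is an
# assumption.
from transformers import PreTrainedTokenizerFast, T5Config

t5_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./uni-tokenizer.json",
    unk_token="<unk>",
    eos_token="</s>",
    pad_token="<pad>",
)
t5_tokenizer.save_pretrained("./chem-t5-tokenizer")  # hypothetical output dir

config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=t5_tokenizer.vocab_size)
config.save_pretrained("./chem-t5-tokenizer")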
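# For reference, since pretokenizer.py is not shown: "atomwise" SMILES
# tokenizers are typically regex-based splitters in the style of Schwaller et
# al.'s Molecular Transformer. The sketch below is an assumption about what
# the imported atomwise_tokenizer does, not its actual implementation.
import re

SMILES_ATOM_REGEX = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
)

def atomwise_tokenizer_sketch(smiles):
    """Split a SMILES string into atom/bond/ring-closure tokens
    (hypothetical stand-in for atomwise_tokenizer)."""
    return SMILES_ATOM_REGEX.findall(smiles)

# atomwise_tokenizer_sketch("O=[N+]([O-])c1ccc(Cl)cc1")
# -> ['O', '=', '[N+]', '(', '[O-]', ')', 'c', '1', 'c', 'c', 'c', '(', 'Cl', ')', 'c', 'c', '1']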