import re

import pandas as pd
from tqdm import tqdm

from t5_tokenizer_model import SentencePieceUnigramTokenizer
# from pretokenizer import atomwise_tokenizer

# Tokenizer-training hyperparameters (consumed by the training step at the end).
vocab_size = 32_000
input_sentence_size = None

# Load the raw SMILES data. An earlier variant read the original training file:
# dataset = load_dataset('csv', data_files='/home/zoez/Chem-T5/train-file.csv', split='train')
dataset = pd.read_csv('./chemT5_data.csv')

# Earlier (now disabled) per-row preprocessing pass that stripped numeric
# prefixes and optionally applied atomwise_tokenizer to each SMILES string:
# for i, line in tqdm(enumerate(dataset['SMILES'])):
#     line = re.sub(r'\d+ ', '', line)
#     dataset.loc[i, 'SMILES'] = line  # .loc avoids the chained-assignment bug in iloc[i]['SMILES']

# Keep only the SMILES column.
dataset = pd.DataFrame(columns=['SMILES'], data=dataset)

# Drop rows containing a literal "" (malformed CSV quoting); na=True also
# marks NaN rows as matches, so the negation removes them too.
dataset = dataset[~dataset.SMILES.str.contains('""', regex=False, na=True)]

dataset.to_csv('chemT5_data.csv', index=False)
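
# ---------------------------------------------------------------------------
# Tokenizer-training sketch. The imports and vocab_size above point at this
# step, but the original script never performs it; the code below is an
# illustrative sketch, not the author's confirmed pipeline. It assumes
# t5_tokenizer_model is the helper from Hugging Face's flax T5 pre-training
# example, whose SentencePieceUnigramTokenizer exposes
# train_from_iterator(iterator, vocab_size, show_progress). The batch size
# and output filename below are assumed values.
# ---------------------------------------------------------------------------
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")

def batch_iterator(batch_size=1_000):
    # Yield lists of SMILES strings; cap at input_sentence_size if it is set.
    # Batching keeps memory bounded instead of materializing the whole column.
    n = len(dataset) if input_sentence_size is None else input_sentence_size
    for i in range(0, n, batch_size):
        yield dataset['SMILES'].iloc[i : i + batch_size].tolist()

tokenizer.train_from_iterator(
    iterator=batch_iterator(),
    vocab_size=vocab_size,
    show_progress=True,
)
tokenizer.save('./tokenizer.json')  # assumed output path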