# chemT5 / dataset-clean.py

import numpy as np
import pandas as pd
import re
from t5_tokenizer_model import SentencePieceUnigramTokenizer
#from pretokenizer import atomwise_tokenizer
from tqdm import tqdm
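
# Tokenizer settings (kept from the tokenizer-training setup); not used by this cleaning script.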
vocab_size = 32_000
input_sentence_size = None

# Load the SMILES dataset from the local CSV
# (originally loaded with datasets.load_dataset from /home/zoez/Chem-T5/train-file.csv).
dataset = pd.read_csv('./chemT5_data.csv')

# Earlier one-off cleaning (now disabled): the SMILES column was rebuilt from the raw file,
# the first two characters of each string were trimmed, leading "<index> " prefixes were
# stripped with re.sub(r'\d+ ', '', line), atomwise_tokenizer was left as an optional
# pre-tokenization step, NaN rows were dropped, and the result was written to chemT5_data.csv.
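
# Keep only the SMILES column.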
dataset = pd.DataFrame(columns=['SMILES'], data=dataset)

# A second disabled pass dropped the stray 'Unnamed: 0' index column, renamed the remaining
# column to 'SMILES', and optionally ran atomwise_tokenizer over each entry.
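
# Drop rows whose SMILES contains a literal "" (malformed quoting) and, via na=True,
# rows where SMILES is missing.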
dataset = dataset[~dataset.SMILES.str.contains('""', regex=False, na=True)]
#print(dataset[0:5])
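
# Overwrite the cleaned CSV in place.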
dataset.to_csv('chemT5_data.csv', index=False)