import pandas as pd
from t5_tokenizer_model import SentencePieceUnigramTokenizer



# Tokenizer training hyperparameters (for the SentencePiece Unigram tokenizer
# trained on the cleaned corpus).
vocab_size = 32_000
input_sentence_size = None  # None means train on every sentence in the dataset

# Load the SMILES corpus. The one-off cleanup that produced this CSV has
# already been applied and saved (see the commented sketch below).
dataset = pd.read_csv('./chemT5_data.csv')
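# For reference, a vectorized sketch of the one-off cleanup pass that produced
# chemT5_data.csv from the raw export (originally a row-by-row loop using
# re.sub to strip "<digits> " prefixes, followed by dropping NaN rows). Kept
# commented out; only needed when regenerating the file:
#
# dataset['SMILES'] = dataset['SMILES'].str.replace(r'\d+ ', '', regex=True)
# dataset = dataset.dropna()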

# Keep only the SMILES column (creates an empty column if it is missing).
dataset = pd.DataFrame(columns=['SMILES'], data=dataset)

# Drop malformed rows: entries containing a literal "" substring, plus missing
# values (na=True marks NaN as a match, so the negation removes them as well).
dataset = dataset[~dataset.SMILES.str.contains('""', regex=False, na=True)]

# Write the cleaned corpus back out for tokenizer training.
dataset.to_csv('chemT5_data.csv', index=False)
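
# A minimal, untested sketch of how the cleaned corpus and the constants above
# could feed the imported SentencePieceUnigramTokenizer. It assumes the class
# matches the Hugging Face T5 pretraining example (t5_tokenizer_model.py),
# which exposes train_from_iterator(); the special tokens, batch size, and
# output path below are illustrative assumptions, not part of the original.
tokenizer = SentencePieceUnigramTokenizer(
    unk_token="<unk>", eos_token="</s>", pad_token="<pad>"
)

def batch_iterator(batch_size=1_000):
    # Yield SMILES strings in chunks so the whole corpus is never
    # materialized as one giant list.
    limit = len(dataset) if input_sentence_size is None else input_sentence_size
    for i in range(0, limit, batch_size):
        yield dataset['SMILES'].iloc[i:i + batch_size].tolist()

tokenizer.train_from_iterator(
    iterator=batch_iterator(),
    vocab_size=vocab_size,
    show_progress=True,
)
tokenizer.save("./chemT5_tokenizer.json")  # assumed output path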