import pandas as pd

from t5_tokenizer_model import SentencePieceUnigramTokenizer
# from pretokenizer import atomwise_tokenizer  # optional atom-wise SMILES pre-tokenization

# Tokenizer hyperparameters, used by the training sketch at the bottom of this file.
vocab_size = 32_000
input_sentence_size = None  # None -> train on the full corpus
# Load the raw SMILES corpus.
dataset = pd.read_csv('./chemT5_data.csv')
# An earlier one-off pass stripped leading "<digits> " prefixes from each
# SMILES string (re.sub(r'\d+ ', '', line)) and wrote the result back to
# chemT5_data.csv, so that cleanup is already reflected in the file loaded above.
# Keep only the SMILES column.
dataset = pd.DataFrame(columns=['SMILES'], data=dataset)
# Drop malformed entries: rows whose SMILES contain a doubled quote ("") and,
# via na=True, rows whose SMILES is missing (NaN counts as a match, so the
# negation removes those rows as well).
dataset = dataset[~dataset.SMILES.str.contains('""', regex=False, na=True)]

# Write the cleaned corpus back in place.
dataset.to_csv('chemT5_data.csv', index=False)
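
# --- Hedged sketch, not part of the original script: the unused import and
# hyperparameters above suggest the cleaned corpus is next used to train the
# tokenizer. Assuming t5_tokenizer_model is the module from the HuggingFace
# T5 pre-training example (where SentencePieceUnigramTokenizer exposes
# train_from_iterator(iterator, vocab_size, show_progress)), the training
# step could look like this; the output path is illustrative.

def batch_iterator(input_sentence_size=None):
    # Yield the SMILES column in small batches so the trainer streams the
    # corpus instead of materializing one giant list in memory.
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset['SMILES'][i : i + batch_length].tolist()

tokenizer = SentencePieceUnigramTokenizer()
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)
tokenizer.save('./chemT5-tokenizer.json')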