ritwikm committed
Commit b7c468b · 1 Parent(s): 258464b

added code files

code/data_preprocessing.py ADDED
@@ -0,0 +1,198 @@
+ import re, glob, string
+ import math
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+ import torch
+ tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
+ from nltk.tokenize import sent_tokenize
+
+ # ----------------------------- Cleaning process 1/2 -----------------------------
+
+ def sanitize(line):
+     # print('before', line)
+     line2 = re.sub(r'\[.+\]','',line)
+     # print('after',line2)
+     for a in ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]:
+         line2 = line2.replace(a,'')
+     line2 = re.sub(r'\b[A-Z]+\b','',line2.strip())
+     line2 = re.sub(r'\d','',line2)
+     line2 = line2.translate(str.maketrans('','',"‟“’❝❞‚‘‛❛❜❟’")) # remove curly/typographic quote characters
+     line2 = line2.translate(str.maketrans('','',string.punctuation))
+     line2 = re.sub(r'\s+',' ',line2).strip()
+     return line2
+
+ def remove_footnotes_and_clean(sents):
+     sents = [x.replace("'",'').replace('*','').replace('’®','').replace('’','') for x in sents]
+     s = ''
+     for line in sents:
+         try:
+             if line.strip()[-1] != '-':
+                 s = s + line.strip() + ' '
+             else:
+                 s = s + line.strip()
+         except: # an empty line raises IndexError; print the context and pause
+             print(sents)
+             input()
+     s = re.sub(r'\s+',' ',s)
+     return s
+
+ path = 'text_files/'
+ ml = sorted(glob.glob(path+'*.txt'))
+ show = False
+
+ path = 'clean_text_files/'
+ for k,m in enumerate(tqdm(ml, total=len(ml), ncols=100)):
+     # m = ml[-1]
+     # if k < 67:
+     #     continue
+     file = open(m,'r')
+     content = file.readlines()
+     file.close()
+
+     if show:
+         print(m)
+
+     paras = []
+     sents = []
+
+     mean_spaces = []
+     footnote_found = False
+
+     for line in content:
+         line2 = sanitize(line)
+         if re.search(r'^\W\s\w',line.strip()):
+             footnote_found = True
+         if re.search(r'^VOL.*\d\d\d\d.*\d$',line.strip()) or 'THE COLLECTED WORKS OF MAHATMA GANDHI' in line.strip():
+             # new page
+             footnote_found = False
+
+         if len(line2) > 5 and len(line2.split()) > 4 and footnote_found==False:
+             if show:
+                 print(line.rstrip(),end='')
+             li_spaces = len(line) - len(line.strip())
+             if show:
+                 print(li_spaces)
+             mean_spaces.append(li_spaces)
+             # input()
+
+     mean_spaces = math.floor(sum(mean_spaces)/len(mean_spaces))
+     if show:
+         print('ms',mean_spaces)
+         print(' '*mean_spaces+'^')
+     footnote_found = False
+     last_spaces = -1
+     i = 0
+     while i < len(content)-1:
+         # line2 = re.sub(r'[A-Z]','',line.strip())
+         # line2 = re.sub(r'\[\w+\]','',line2)
+         line = content[i]
+         li_spaces = len(line) - len(line.strip())
+         if re.search(r'^\W\s\w',line.strip()):
+             footnote_found = True
+         if re.search(r'^VOL.*\d\d\d\d.*\d$',line.strip()) or 'THE COLLECTED WORKS OF MAHATMA GANDHI' in line.strip():
+             # new page
+             footnote_found = False
+             i+=1
+             # print('--',line.rstrip())
+             continue
+         if footnote_found == False:
+             if not (li_spaces > mean_spaces):
+                 # the current line's leading spaces are at or below the mean, so it continues the current paragraph
+                 line2 = sanitize(line)
+                 if len(line2) > 5 and len(line2.split()) > 4:
+                     if show:
+                         print('++',line.rstrip())
+                     sents.append(line)
+                     last_spaces = li_spaces
+                 elif last_spaces == li_spaces:
+                     if show:
+                         print('++',line.rstrip())
+                     sents.append(line)
+                 else:
+                     last_spaces = -1
+                     if show:
+                         print('--',line.rstrip())
+             else:
+                 # the current line is indented more than the mean, i.e. a possible new-paragraph start
+                 next_line = content[i+1]
+                 lj_spaces = len(next_line) - len(next_line.strip())
+                 if not (lj_spaces > mean_spaces):
+                     # print('b4', line)
+                     line1 = sanitize(content[i])
+                     line2 = sanitize(next_line)
+                     # print('now',line2)
+                     if len(line1) > 5 and len(line1.split()) > 4 and len(line2) > 5 and len(line2.split()) > 4:
+                         sent_text = remove_footnotes_and_clean(sents)
+                         paras.append(sent_text)
+                         if show:
+                             print('++',line.rstrip(),'<------NEW PARA')
+                         sents = [line]
+                         # print('$$',paras[-1])
+                     else:
+                         last_spaces = -1
+                         if show:
+                             print('--',line.rstrip())
+                 else:
+                     last_spaces = -1
+                     if show:
+                         print('--',line.rstrip())
+         else:
+             last_spaces = -1
+             if show:
+                 print('--',line.rstrip())
+         if show:
+             input('wait')
+         i+=1
+
+     file = open(path+m.split('/')[-1],'w')
+     file.write('\n'.join(paras[1:]))
+     file.close()
+     # input('here wait')
+
+
+
+ # ----------------------------- Cleaning process 2/2 -----------------------------
+ path = 'clean_text_files/'
+ ml = sorted(glob.glob(path+'*.txt'))
+
+ text = []
+
+ for m in tqdm(range(1,99)):
+     file = open(path+str(m)+'.txt','r')
+     text += file.readlines()
+     file.close()
+
+ file = open('all_paras.txt','w')
+ file.write(''.join(text))
+ file.close()
+
+ sents = []
+ tcsents = [] # transformer-compatible sents (paragraph chunks that fit the token budget)
+ para_stack = []
+ for para in tqdm(text):
+     para = para.strip()
+     sents += sent_tokenize(para)
+     para_stack = [para] + para_stack
+     while len(para_stack)!=0:
+         top_para = para_stack.pop(0)
+         if len(tokenizer('<|startoftext|>'+ top_para + '<|endoftext|>')['input_ids']) > 200: # <-------------
+             ts = sent_tokenize(top_para)
+             if len(ts) > 1:
+                 para_stack = [' '.join(ts[int(len(ts)/2):])] + para_stack # second half
+                 para_stack = [' '.join(ts[:int(len(ts)/2)])] + para_stack # first half
+             else:
+                 tcsents.append(top_para)
+         else:
+             tcsents.append(top_para)
+
+
+ file = open('all_sents.txt','w')
+ file.write('\n'.join(sents))
+ file.close()
+
+ file = open('all_tc_sents_200.txt','w')
+ file.write('\n'.join(tcsents))
+ file.close()
+
+
+
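A small sanity-check sketch (not part of the commit, only an assumption about how one might inspect the output): the script above writes all_tc_sents_200.txt using a 200-token budget, while gpt-finetune.py later reads all_tc_sents_768.txt, presumably the same procedure run with a 768-token budget. The snippet re-tokenizes each written chunk and counts how many still exceed the budget, which can happen only when a single sentence is itself longer than 200 tokens, since the splitter never breaks inside a sentence.

# Illustrative check, assuming data_preprocessing.py has already produced all_tc_sents_200.txt.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')

with open('all_tc_sents_200.txt') as f:
    chunks = [line.strip() for line in f if line.strip()]

# Chunks can only exceed 200 tokens when a single sentence is that long on its own.
over_budget = [c for c in chunks
               if len(tokenizer('<|startoftext|>' + c + '<|endoftext|>')['input_ids']) > 200]
print('{} of {} chunks exceed the 200-token budget'.format(len(over_budget), len(chunks)))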
code/file.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b2a57a58bc338df0a10eb28d73efe347d820bdd58a271b1f032562c8a857aa2
+ size 1112205
code/gpt-finetune.py ADDED
@@ -0,0 +1,904 @@
+ import os
+ import time
+ import datetime
+
+ import pandas as pd
+ import seaborn as sns
+ import numpy as np
+ import random
+
+ import matplotlib.pyplot as plt
+
+ import torch
+ from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
+
+
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
+ from transformers import AdamW, get_linear_schedule_with_warmup
+
+ import nltk
+ nltk.download('punkt')
+
+ import sys
+
+ import pytz
+ IST = pytz.timezone('Asia/Kolkata')
+ stamp = datetime.datetime.now(IST).strftime("%c")
+
+ print('\n')
+ print('='*100)
+ print('='*100)
+ print('\t\t=Experiment6=',stamp)
+ print('='*100)
+ print('='*100)
+
+ out_path = '/media/data_dump/Ritwik/ggpt/'
+
+
+ # for i in range(10):
+ #     print(i)
+ #     time.sleep(1)
+
+
+ # exit()
+
+ hyper_params = {'rseed': 123}
+
+ import torch, numpy as np, random, transformers, psutil, time
+ os.environ['PYTHONHASHSEED'] = str(hyper_params['rseed'])
+ # Torch RNG
+ torch.manual_seed(hyper_params['rseed'])
+ torch.cuda.manual_seed(hyper_params['rseed'])
+ torch.cuda.manual_seed_all(hyper_params['rseed'])
+ # Python RNG
+ np.random.seed(hyper_params['rseed'])
+ random.seed(hyper_params['rseed'])
+ transformers.set_seed(hyper_params['rseed'])
+
+ # Load the GPT tokenizer.
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium
+
+ sfile = '/media/nas_mount/Ritwik/Ai4Bharat_text_corpora/data/en/en_clean.txt'
+ print(sfile)
+ file = open(sfile,'r')
+ lines = file.readlines()
+ file.close()
+ lines = [[x.strip()] for x in lines]
+
+ df = pd.DataFrame(lines, columns=['bio_main'])
+
+ print('Dataframe created')
+ df.dropna(inplace=True) #remove NA values
+ bios = df.bio_main.copy()
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ # doc_lengths = []
+ # for bio in bios:
+ #     # get rough token count distribution
+ #     tokens = nltk.word_tokenize(bio)
+ #     doc_lengths.append(len(tokens))
+ # doc_lengths = np.array(doc_lengths)
+ # a = sns.distplot(doc_lengths)
+ # a.get_figure().savefig(out_path+"out.png")
+ # print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)',len(doc_lengths[doc_lengths > 768])/len(doc_lengths))
+ # print('np.average(doc_lengths)',np.average(doc_lengths))
+ # print(datetime.datetime.now(IST).strftime("%c"))
+
+
+ print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
+ print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
+ print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
+ print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ batch_size = 8
+
+ class GPT2Dataset(Dataset):
+
+     def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
+
+         self.tokenizer = tokenizer
+         self.max_length = max_length
+         # self.input_ids = []
+         # self.attn_masks = []
+         self.sents = list(txt_list)
+
+         # for txt in txt_list:
+         #     ###self.sents.append(txt)
+
+         #     encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
+
+         #     self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
+         #     self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
+
+     def __len__(self):
+         # return len(self.input_ids)
+         return len(self.sents)
+
+     def __getitem__(self, idx):
+         # return self.input_ids[idx], self.attn_masks[idx]
+         txt = self.sents[idx]
+         encodings_dict = self.tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=self.max_length, padding="max_length")
+         input_ids = torch.tensor(encodings_dict['input_ids'])
+         attn_masks = torch.tensor(encodings_dict['attention_mask'])
+         return input_ids, attn_masks
+
+ dataset = GPT2Dataset(bios, tokenizer, max_length=500)
+
+ # temp_dataloader = DataLoader(
+ #             dataset, # The training samples.
+ #             sampler = RandomSampler(dataset), # Select batches randomly
+ #             batch_size = batch_size # Trains with this batch size.
+ #         )
+
+ # for temp in temp_dataloader:
+ #     print(temp)
+ #     print(temp[0].shape)
+ #     input()
+
+ # Split into training and validation sets
+ train_size = int(0.9 * len(dataset))
+ val_size = len(dataset) - train_size
+
+ train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
+
+ print('{:>5,} training samples'.format(train_size))
+ print('{:>5,} validation samples'.format(val_size))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ # Create the DataLoaders for our training and validation datasets.
+ # We'll take training samples in random order.
+ train_dataloader = DataLoader(
+             train_dataset, # The training samples.
+             sampler = RandomSampler(train_dataset), # Select batches randomly
+             batch_size = batch_size # Trains with this batch size.
+         )
+
+ # For validation the order doesn't matter, so we'll just read them sequentially.
+ validation_dataloader = DataLoader(
+             val_dataset, # The validation samples.
+             sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
+             batch_size = batch_size # Evaluate with this batch size.
+         )
+
+
+ # I'm not really doing anything with the config here
+ configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
+
+ # instantiate the model
+ model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
+
+ # this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
+ # otherwise the tokenizer and model tensors won't match up
+ model.resize_token_embeddings(len(tokenizer))
+
+ # Tell pytorch to run this model on the GPU.
+ device = torch.device("cuda")
+
+ model = model.to(device)
+
+ print('Model loaded to GPU')
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ # checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
+ # print(model.load_state_dict(checkpoint['state_dict']))
+ # del checkpoint
+ # tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar') #.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')
+
+ # some parameters I cooked up that work reasonably well
+
+ epochs = 1
+ learning_rate = 5e-4
+ warmup_steps = 1e2
+ epsilon = 1e-8
+
+ # this produces sample output every `sample_every` training steps
+ sample_every = 1000
+
+ # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
+ optimizer = AdamW(model.parameters(),
+                   lr = learning_rate,
+                   eps = epsilon
+                 )
+
+ # Total number of training steps is [number of batches] x [number of epochs].
+ # (Note that this is not the same as the number of training samples).
+ total_steps = len(train_dataloader) * epochs
+
+ # Create the learning rate scheduler.
+ # This changes the learning rate as the training loop progresses
+ scheduler = get_linear_schedule_with_warmup(optimizer,
+                                             num_warmup_steps = warmup_steps,
+                                             num_training_steps = total_steps)
+
+
+
+
+ def format_time(elapsed):
+     return str(datetime.timedelta(seconds=int(round((elapsed)))))
+
+ output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'
+
+ # Create output directory if needed
+ if not os.path.exists(output_dir):
+     os.makedirs(output_dir)
+
+ total_t0 = time.time()
+
+ training_stats = []
+
+ last_epoch, last_step = -1, -1
+ try:
+     file = open(out_path+'model_save/checkpoint_state_pretraining.txt','r')
+     content = [x.split(':') for x in file.read().split('|')]
+     file.close()
+ except:
+     content = []
+
+ if len(content) == 2:
+     last_epoch = int(content[1][1])
+     last_step = int(content[0][1])
+
+     checkpoint = torch.load(out_path+'model_save/model_checkpoint_pretraining.pth.tar')
+     print(model.load_state_dict(checkpoint['state_dict']))
+     tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
+     print(datetime.datetime.now(IST).strftime("%c"))
+ # else:
+ #     print(content)
+ #     input('wait')
+
+
+ for epoch_i in range(0, epochs):
+
+     # ========================================
+     #               Training
+     # ========================================
+
+     print("")
+     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+     print('Training...')
+
+     if last_epoch!=-1:
+         if epoch_i < last_epoch:
+             continue
+
+     t0 = time.time()
+
+     total_train_loss = 0
+
+     model.train()
+
+     for step, batch in enumerate(train_dataloader):
+
+         if last_step != -1:
+             if step <= last_step:
+                 continue
+
+         b_input_ids = batch[0].to(device)
+         b_labels = batch[0].to(device)
+         b_masks = batch[1].to(device)
+
+         model.zero_grad()
+
+         outputs = model( b_input_ids,
+                          labels=b_labels,
+                          attention_mask = b_masks,
+                          token_type_ids=None
+                        )
+
+         loss = outputs[0]
+
+         batch_loss = loss.item()
+         total_train_loss += batch_loss
+
+         # Get sample every x batches. Ignoring the first step.
+         if step % sample_every == 0 and not step == 0:
+
+             elapsed = format_time(time.time() - t0)
+             print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
+
+             model.eval()
+
+             sample_outputs = model.generate(
+                                     bos_token_id=random.randint(1,30000),
+                                     do_sample=True,
+                                     top_k=50,
+                                     max_length = 200,
+                                     top_p=0.95,
+                                     num_return_sequences=1
+                                 )
+             for i, sample_output in enumerate(sample_outputs):
+                 print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
+
+             model.train()
+
+             try: # save a resumable checkpoint (model, tokenizer, and step/epoch marker); retried once below if it fails
+                 torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint_pretraining.pth.tar')
+                 torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
+                 file = open(out_path+'model_save/checkpoint_state_pretraining.txt','w')
+                 file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
+                 file.close()
+             except:
+                 torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint_pretraining.pth.tar')
+                 torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
+                 file = open(out_path+'model_save/checkpoint_state_pretraining.txt','w')
+                 file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
+                 file.close()
+
+         loss.backward()
+
+         optimizer.step()
+
+         scheduler.step()
+
+     last_epoch, last_step = -1, -1
+     # Calculate the average loss over all of the batches.
+     avg_train_loss = total_train_loss / len(train_dataloader)
+
+     # Measure how long this epoch took.
+     training_time = format_time(time.time() - t0)
+
+     print("")
+     print(" Average training loss: {0:.2f}".format(avg_train_loss))
+     print(" Training epoch took: {:}".format(training_time))
+     print(datetime.datetime.now(IST).strftime("%c"))
+
+     # ========================================
+     #               Validation
+     # ========================================
+
+     print("")
+     print("Running Validation...")
+
+     t0 = time.time()
+
+     model.eval()
+
+     total_eval_loss = 0
+     nb_eval_steps = 0
+
+     # Evaluate data for one epoch
+     for batch in validation_dataloader:
+
+         b_input_ids = batch[0].to(device)
+         b_labels = batch[0].to(device)
+         b_masks = batch[1].to(device)
+
+         with torch.no_grad():
+
+             outputs = model(b_input_ids,
+                             # token_type_ids=None,
+                             attention_mask = b_masks,
+                             labels=b_labels)
+
+             loss = outputs[0]
+
+         batch_loss = loss.item()
+         total_eval_loss += batch_loss
+
+     avg_val_loss = total_eval_loss / len(validation_dataloader)
+
+     validation_time = format_time(time.time() - t0)
+
+     print(" Validation Loss: {0:.2f}".format(avg_val_loss))
+     print(" Validation took: {:}".format(validation_time))
+
+     # Record all statistics from this epoch.
+     training_stats.append(
+         {
+             'epoch': epoch_i + 1,
+             'Training Loss': avg_train_loss,
+             'Valid. Loss': avg_val_loss,
+             'Training Time': training_time,
+             'Validation Time': validation_time
+         }
+     )
+
+ print("")
+ print("Training complete!")
+ print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ try:
+     # Display floats with two decimal places.
+     pd.set_option('precision', 2)
+
+     # Create a DataFrame from our training statistics.
+     df_stats = pd.DataFrame(data=training_stats)
+
+     # Use the 'epoch' as the row index.
+     df_stats = df_stats.set_index('epoch')
+
+     # A hack to force the column headers to wrap.
+     # df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
+
+     # Display the table.
+     print(df_stats)
+
+     # Use plot styling from seaborn.
+     sns.set(style='darkgrid')
+
+     # Increase the plot size and font size.
+     sns.set(font_scale=1.5)
+     plt.rcParams["figure.figsize"] = (12,6)
+
+     # Plot the learning curve.
+     plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
+     plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
+
+     # Label the plot.
+     plt.title("Training & Validation Loss")
+     plt.xlabel("Epoch")
+     plt.ylabel("Loss")
+     plt.legend()
+     plt.xticks([1, 2, 3, 4])
+
+     # plt.show()
+     plt.savefig(out_path+"training.png")
+
+     # Get all of the model's parameters as a list of tuples.
+     params = list(model.named_parameters())
+
+     print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))
+
+     print('==== Embedding Layer ====\n')
+
+     for p in params[0:2]:
+         print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+     print('\n==== First Transformer ====\n')
+
+     for p in params[2:14]:
+         print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+     print('\n==== Output Layer ====\n')
+
+     for p in params[-2:]:
+         print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+
+     print("Saving model to %s" % output_dir)
+
+     # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+     # They can then be reloaded using `from_pretrained()`
+     # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
+     # way 1
+     model.save_pretrained(output_dir)
+     tokenizer.save_pretrained(output_dir)
+
+     # way 2
+     # torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')
+
+ except Exception as e:
+     print(e)
+     print('Waiting for 10 seconds')
+     time.sleep(10)
+
+ # ========================= Gandhi Data =======================
+
+ sfile = 'all_tc_sents_768.txt'
+ print(sfile)
+ file = open(sfile,'r')
+ lines = file.readlines()
+ file.close()
+ lines = [[x.strip()] for x in lines]
+
+ df = pd.DataFrame(lines, columns=['bio_main'])
+
+ print('Dataframe created')
+ df.dropna(inplace=True) #remove NA values
+ bios = df.bio_main.copy()
+
+ doc_lengths = []
+ for bio in bios:
+     # get rough token count distribution
+     tokens = nltk.word_tokenize(bio)
+     doc_lengths.append(len(tokens))
+ doc_lengths = np.array(doc_lengths)
+ a = sns.distplot(doc_lengths)
+ a.get_figure().savefig(out_path+"out.png")
+ print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)',len(doc_lengths[doc_lengths > 768])/len(doc_lengths))
+ print('np.average(doc_lengths)',np.average(doc_lengths))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+
+ print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
+ print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
+ print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
+ print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ batch_size = 4
+
+ class GPT2Dataset(Dataset):
+
+     def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
+
+         self.tokenizer = tokenizer
+         self.input_ids = []
+         self.attn_masks = []
+
+         for txt in txt_list:
+
+             encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
+
+             self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
+             self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
+
+     def __len__(self):
+         return len(self.input_ids)
+
+     def __getitem__(self, idx):
+         return self.input_ids[idx], self.attn_masks[idx]
+
+ dataset = GPT2Dataset(bios, tokenizer, max_length=768)
+
+ # Split into training and validation sets
+ train_size = int(0.9 * len(dataset))
+ val_size = len(dataset) - train_size
+
+ train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
+
+ print('{:>5,} training samples'.format(train_size))
+ print('{:>5,} validation samples'.format(val_size))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ # Create the DataLoaders for our training and validation datasets.
+ # We'll take training samples in random order.
+ train_dataloader = DataLoader(
+             train_dataset, # The training samples.
+             sampler = RandomSampler(train_dataset), # Select batches randomly
+             batch_size = batch_size # Trains with this batch size.
+         )
+
+ # For validation the order doesn't matter, so we'll just read them sequentially.
+ validation_dataloader = DataLoader(
+             val_dataset, # The validation samples.
+             sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
+             batch_size = batch_size # Evaluate with this batch size.
+         )
+
+ # Turning this off
+ '''
+ # I'm not really doing anything with the config here
+ configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
+
+ # instantiate the model
+ model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
+
+ # this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
+ # otherwise the tokenizer and model tensors won't match up
+ model.resize_token_embeddings(len(tokenizer))
+
+ # Tell pytorch to run this model on the GPU.
+ device = torch.device("cuda")
+
+ model = model.to(device)
+ '''
+
+ print('Model loaded to GPU')
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ # checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
+ # print(model.load_state_dict(checkpoint['state_dict']))
+ # del checkpoint
+ # tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar') #.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')
+
+ # some parameters I cooked up that work reasonably well
+
+ epochs = 3
+ learning_rate = 5e-4
+ warmup_steps = 1e2
+ epsilon = 1e-8
+
+ # this produces sample output every `sample_every` training steps
+ sample_every = 1000
+
+ # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
+ optimizer = AdamW(model.parameters(),
+                   lr = learning_rate,
+                   eps = epsilon
+                 )
+
+ # Total number of training steps is [number of batches] x [number of epochs].
+ # (Note that this is not the same as the number of training samples).
+ total_steps = len(train_dataloader) * epochs
+
+ # Create the learning rate scheduler.
+ # This changes the learning rate as the training loop progresses
+ scheduler = get_linear_schedule_with_warmup(optimizer,
+                                             num_warmup_steps = warmup_steps,
+                                             num_training_steps = total_steps)
+
+
+
+
+ def format_time(elapsed):
+     return str(datetime.timedelta(seconds=int(round((elapsed)))))
+
+ output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'
+
+ # Create output directory if needed
+ if not os.path.exists(output_dir):
+     os.makedirs(output_dir)
+
+ total_t0 = time.time()
+
+ training_stats = []
+
+ last_epoch, last_step = -1, -1
+ try:
+     file = open(out_path+'model_save/checkpoint_state.txt','r')
+     content = [x.split(':') for x in file.read().split('|')]
+     file.close()
+ except:
+     content = []
+
+ if len(content) == 2:
+     last_epoch = int(content[1][1])
+     last_step = int(content[0][1])
+
+     checkpoint = torch.load(out_path+'model_save/model_checkpoint.pth.tar')
+     print(model.load_state_dict(checkpoint['state_dict']))
+     tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar')
+     print(datetime.datetime.now(IST).strftime("%c"))
+ # else:
+ #     print(content)
+ #     input('wait')
+
+
+ for epoch_i in range(0, epochs):
+
+     # ========================================
+     #               Training
+     # ========================================
+
+     print("")
+     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+     print('Training...')
+
+     if last_epoch!=-1:
+         if epoch_i < last_epoch:
+             continue
+
+     t0 = time.time()
+
+     total_train_loss = 0
+
+     model.train()
+
+     for step, batch in enumerate(train_dataloader):
+
+         if last_step != -1:
+             if step <= last_step:
+                 continue
+
+         b_input_ids = batch[0].to(device)
+         b_labels = batch[0].to(device)
+         b_masks = batch[1].to(device)
+
+         model.zero_grad()
+
+         outputs = model( b_input_ids,
+                          labels=b_labels,
+                          attention_mask = b_masks,
+                          token_type_ids=None
+                        )
+
+         loss = outputs[0]
+
+         batch_loss = loss.item()
+         total_train_loss += batch_loss
+
+         # Get sample every x batches. Ignoring the first step.
+         if step % sample_every == 0 and not step == 0:
+
+             elapsed = format_time(time.time() - t0)
+             print(' Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
+
+             model.eval()
+
+             sample_outputs = model.generate(
+                                     bos_token_id=random.randint(1,30000),
+                                     do_sample=True,
+                                     top_k=50,
+                                     max_length = 200,
+                                     top_p=0.95,
+                                     num_return_sequences=1
+                                 )
+             for i, sample_output in enumerate(sample_outputs):
+                 print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
+
+             model.train()
+
+             torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint.pth.tar')
+             torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint.pth.tar')
+             file = open(out_path+'model_save/checkpoint_state.txt','w')
+             file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
+             file.close()
+
+         loss.backward()
+
+         optimizer.step()
+
+         scheduler.step()
+
+     last_epoch, last_step = -1, -1
+     # Calculate the average loss over all of the batches.
+     avg_train_loss = total_train_loss / len(train_dataloader)
+
+     # Measure how long this epoch took.
+     training_time = format_time(time.time() - t0)
+
+     print("")
+     print(" Average training loss: {0:.2f}".format(avg_train_loss))
+     print(" Training epoch took: {:}".format(training_time))
+     print(datetime.datetime.now(IST).strftime("%c"))
+
+     # ========================================
+     #               Validation
+     # ========================================
+
+     print("")
+     print("Running Validation...")
+
+     t0 = time.time()
+
+     model.eval()
+
+     total_eval_loss = 0
+     nb_eval_steps = 0
+
+     # Evaluate data for one epoch
+     for batch in validation_dataloader:
+
+         b_input_ids = batch[0].to(device)
+         b_labels = batch[0].to(device)
+         b_masks = batch[1].to(device)
+
+         with torch.no_grad():
+
+             outputs = model(b_input_ids,
+                             # token_type_ids=None,
+                             attention_mask = b_masks,
+                             labels=b_labels)
+
+             loss = outputs[0]
+
+         batch_loss = loss.item()
+         total_eval_loss += batch_loss
+
+     avg_val_loss = total_eval_loss / len(validation_dataloader)
+
+     validation_time = format_time(time.time() - t0)
+
+     print(" Validation Loss: {0:.2f}".format(avg_val_loss))
+     print(" Validation took: {:}".format(validation_time))
+
+     # Record all statistics from this epoch.
+     training_stats.append(
+         {
+             'epoch': epoch_i + 1,
+             'Training Loss': avg_train_loss,
+             'Valid. Loss': avg_val_loss,
+             'Training Time': training_time,
+             'Validation Time': validation_time
+         }
+     )
+
+ print("")
+ print("Training complete!")
+ print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ # Display floats with two decimal places.
+ pd.set_option('precision', 2)
+
+ # Create a DataFrame from our training statistics.
+ df_stats = pd.DataFrame(data=training_stats)
+
+ # Use the 'epoch' as the row index.
+ df_stats = df_stats.set_index('epoch')
+
+ # A hack to force the column headers to wrap.
+ # df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])
+
+ # Display the table.
+ print(df_stats)
+
+ # Use plot styling from seaborn.
+ sns.set(style='darkgrid')
+
+ # Increase the plot size and font size.
+ sns.set(font_scale=1.5)
+ plt.rcParams["figure.figsize"] = (12,6)
+
+ # Plot the learning curve.
+ plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
+ plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
+
+ # Label the plot.
+ plt.title("Training & Validation Loss")
+ plt.xlabel("Epoch")
+ plt.ylabel("Loss")
+ plt.legend()
+ plt.xticks([1, 2, 3, 4])
+
+ # plt.show()
+ plt.savefig(out_path+"training.png")
+
+ # Get all of the model's parameters as a list of tuples.
+ params = list(model.named_parameters())
+
+ print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))
+
+ print('==== Embedding Layer ====\n')
+
+ for p in params[0:2]:
+     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+ print('\n==== First Transformer ====\n')
+
+ for p in params[2:14]:
+     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+ print('\n==== Output Layer ====\n')
+
+ for p in params[-2:]:
+     print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+ # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+
+ print("Saving model to %s" % output_dir)
+
+ # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+ # They can then be reloaded using `from_pretrained()`
+ # model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
+ # way 1
+ model.save_pretrained(output_dir)
+ tokenizer.save_pretrained(output_dir)
+
+ # way 2
+ # torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')
+
+
+ # Loading
+
+ # way 1
+ # model = model.from_pretrained(output_dir).to(device)
+ # tokenizer = tokenizer.from_pretrained(output_dir)
+
+ # way 2
+ # checkpoint = torch.load(out_path+'model_save/final_checkpoint.pth.tar')
+ # print(model.load_state_dict(checkpoint['state_dict']))
+ # del checkpoint
+ # tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar')
+
+
+ print('Model and tokenizer loaded!')
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ model.eval()
+
+ prompt = "<|startoftext|> I wish to say that"
+
+ generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
+ generated = generated.to(device)
+
+ print(generated)
+
+ sample_outputs = model.generate(
+                         generated,
+                         # bos_token_id=random.randint(1,30000),
+                         do_sample=True,
+                         top_k=50,
+                         max_length = 500,
+                         top_p=0.95,
+                         num_return_sequences=3
+                     )
+
+ for i, sample_output in enumerate(sample_outputs):
+     print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
+
+ print(datetime.datetime.now(IST).strftime("%c"))
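A short follow-up sketch (not part of the committed file): the loss returned by GPT2LMHeadModel is the mean token-level cross-entropy, so the validation perplexity of either training stage can be read off the printed avg_val_loss with a single exponential. The helper name below is illustrative.

import math

def perplexity(avg_val_loss):
    # GPT-2's LM loss is mean cross-entropy per token, so perplexity = exp(loss).
    return math.exp(avg_val_loss)

# e.g. pass in the value printed as "Validation Loss: ..." by the loop above.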
code/gpt-run.py ADDED
@@ -0,0 +1,85 @@
+ import os
+ import time
+ import datetime
+
+ import pandas as pd
+ import seaborn as sns
+ import numpy as np
+ import random
+
+ import matplotlib.pyplot as plt
+
+ import torch
+ from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
+
+
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
+ from transformers import AdamW, get_linear_schedule_with_warmup
+
+ import sys
+
+ import pytz
+ IST = pytz.timezone('Asia/Kolkata')
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium
+
+ # I'm not really doing anything with the config here
+ configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
+
+ # instantiate the model
+ model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
+
+ # this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
+ # otherwise the tokenizer and model tensors won't match up
+ model.resize_token_embeddings(len(tokenizer))
+
+ # Tell pytorch to run this model on the GPU.
+ device = torch.device("cuda")
+
+ model = model.to(device)
+
+ print('Model loaded to GPU')
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ output_dir = '/media/data_dump/Ritwik/ggpt/model_save/pytorch_save_files/'
+
+ print('Loading fine-tuned weights')
+ model = model.from_pretrained(output_dir).to(device)
+ tokenizer = tokenizer.from_pretrained(output_dir)
+
+ print('Model and tokenizer loaded!')
+ print(datetime.datetime.now(IST).strftime("%c"))
+
+ model.eval()
+
+ # prompt_list = ['<|startoftext|> Regarding Kashmir I am very confident to say that','<|startoftext|> I wanted to save bhagat singh but','<|startoftext|> I wanted to save bhagat singh but fortunately','<|startoftext|> I wanted to save bhagat singh but unfortunately','<|startoftext|> Reporter: What is your biggest fear? Gandhi:','<|startoftext|> Question) What is your biggest fear?','<|startoftext|> Regarding Muslims and Islam I strongly believe that','<|startoftext|> I wish to say that','<|startoftext|> I chose Nehru over Patel for Prime Minister because','<|startoftext|> During my experiments with truth I observed that','<|startoftext|> My opinion on the negroes of Africa is that']
+ prompt_list = ['<|startoftext|> Regarding Kashmir I am very confident to say that']
+
+ for prompt in prompt_list:
+
+     # prompt = "<|startoftext|> Regarding Kashmir I am very confident to say that"
+
+     print(prompt)
+
+     generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
+     generated = generated.to(device)
+
+     print(generated)
+
+     sample_outputs = model.generate(
+                             generated,
+                             # bos_token_id=random.randint(1,30000),
+                             do_sample=True,
+                             top_k=50,
+                             max_length = 500,
+                             top_p=0.95,
+                             num_return_sequences=3
+                         )
+
+     for i, sample_output in enumerate(sample_outputs):
+         print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
+
+     print(datetime.datetime.now(IST).strftime("%c"))
+     print('\n')
+
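A hedged note on reproducibility (not part of the commit): gpt-run.py samples with do_sample=True but never seeds the RNGs, so the generated continuations differ on every run. Seeding before model.generate(), in the same style as gpt-finetune.py, makes the samples repeatable; the seed value below is simply the one used in that script.

import transformers

# Fix the Python/NumPy/PyTorch RNGs that generate() draws from.
transformers.set_seed(123)  # same value as hyper_params['rseed'] in gpt-finetune.py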
code/myocr.py ADDED
@@ -0,0 +1,82 @@
+ '''
+ This file downloads the Collected Works of Mahatma Gandhi volumes as PDFs and OCRs them into text files.
+ '''
+
+ import pytesseract
+ from pytesseract import Output
+ from PIL import Image
+ import pandas as pd
+ from tqdm import tqdm
+ import os.path
+
+ import fitz
+
+ import subprocess
+
+ def do_indent(df):
+     text = ""
+     # clean up blanks
+     df1 = df[(df.conf!='-1')&(df.text!=' ')&(df.text!='')]
+     # sort blocks vertically
+     sorted_blocks = df1.groupby('block_num').first().sort_values('top').index.tolist()
+     for block in sorted_blocks:
+         curr = df1[df1['block_num']==block]
+         sel = curr[curr.text.str.len()>3]
+         char_w = (sel.width/sel.text.str.len()).mean()
+         prev_par, prev_line, prev_left = 0, 0, 0
+         # text = ''
+         for ix, ln in curr.iterrows():
+             # add new line when necessary
+             if prev_par != ln['par_num']:
+                 text += '\n'
+                 prev_par = ln['par_num']
+                 prev_line = ln['line_num']
+                 prev_left = 0
+             elif prev_line != ln['line_num']:
+                 text += '\n'
+                 prev_line = ln['line_num']
+                 prev_left = 0
+
+             added = 0 # num of spaces that should be added
+             if ln['left']/char_w > prev_left + 1:
+                 added = int((ln['left'])/char_w) - prev_left
+                 text += ' ' * added
+             text += ln['text'] + ' '
+             prev_left += len(ln['text']) + added + 1
+         text += '\n'
+     return text
+
+ text_file_path = 'text_files/'
+ start_page = 0
+
+ for h in range(1,99):
+     tfile = text_file_path+str(h)+'.txt'
+     url = "http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-"+str(h)+".pdf"
+     bashCommand = "wget "+url +" -O file.pdf"
+     process = subprocess.Popen(bashCommand.split())
+     output, error = process.communicate()
+
+     pdffile = "file.pdf"
+     doc = fitz.open(pdffile)
+     # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
+     file_text = ""
+
+     for i in tqdm(range(len(doc)), total=len(doc), desc=str(h)+'/98'):
+         if i < start_page:
+             continue
+         page = doc.load_page(i) # number of page
+         mat = fitz.Matrix(5, 5) # zoom factor
+         pix = page.get_pixmap(matrix=mat)
+         output = "outfile.png"
+         pix.save(output)
+         custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita' # LSTM engine, automatic page segmentation, keep layout spacing
+         d = pytesseract.image_to_data(Image.open(output), config=custom_config, output_type=Output.DICT)
+         df = pd.DataFrame(d)
+         file_text += do_indent(df)
+
+     f = open(tfile,'w')
+     f.write(file_text)
+     f.close()
+
+
+
code/outfile.png ADDED