added code files
- code/data_preprocessing.py +198 -0
- code/file.pdf +3 -0
- code/gpt-finetune.py +904 -0
- code/gpt-run.py +85 -0
- code/myocr.py +82 -0
- code/outfile.png +0 -0
code/data_preprocessing.py
ADDED
@@ -0,0 +1,198 @@
import re, glob, string
import math
from tqdm import tqdm
from transformers import AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
from nltk.tokenize import sent_tokenize

# ----------------------------- Cleaning process 1/2 -----------------------------

def sanitize(line):
    # print('before', line)
    line2 = re.sub(r'\[.+\]','',line)
    # print('after',line2)
    for a in ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]:
        line2 = line2.replace(a,'')
    line2 = re.sub(r'\b[A-Z]+\b','',line2.strip())
    line2 = re.sub(r'\d','',line2)
    line2 = line2.translate(str.maketrans('','',"‟“’❝❞‚‘‛❛❜❟â"))  # just removes the quote characters
    line2 = line2.translate(str.maketrans('','',string.punctuation))
    line2 = re.sub(r'\s+',' ',line2).strip()
    return line2

def remove_footnotes_and_clean(sents):
    sents = [x.replace("'",'').replace('*','').replace('’®','').replace('’','') for x in sents]
    s = ''
    for line in sents:
        try:
            if line.strip()[-1] != '-':
                s = s + line.strip() + ' '
            else:
                s = s + line.strip()
        except:
            print(sents)
            input()
    s = re.sub(r'\s+',' ',s)
    return s

path = 'text_files/'
ml = sorted(glob.glob(path+'*.txt'))
show = False

path = 'clean_text_files/'
for k,m in enumerate(tqdm(ml, total=len(ml), ncols=100)):
    # m = ml[-1]
    # if k < 67:
    #     continue
    file = open(m,'r')
    content = file.readlines()
    file.close()

    if show:
        print(m)

    paras = []
    sents = []

    mean_spaces = []
    footnote_found = False

    for line in content:
        line2 = sanitize(line)
        if re.search(r'^\W\s\w',line.strip()):
            footnote_found = True
        if re.search(r'^VOL.*\d\d\d\d.*\d$',line.strip()) or 'THE COLLECTED WORKS OF MAHATMA GANDHI' in line.strip():
            # new page
            footnote_found = False

        if len(line2) > 5 and len(line2.split()) > 4 and footnote_found==False:
            if show:
                print(line.rstrip(),end='')
            li_spaces = len(line) - len(line.strip())
            if show:
                print(li_spaces)
            mean_spaces.append(li_spaces)
            # input()

    mean_spaces = math.floor(sum(mean_spaces)/len(mean_spaces))
    if show:
        print('ms',mean_spaces)
        print(' '*mean_spaces+'^')
    footnote_found = False
    last_spaces = -1
    i = 0
    while i < len(content)-1:
        # line2 = re.sub(r'[A-Z]','',line.strip())
        # line2 = re.sub(r'\[\w+\]','',line2)
        line = content[i]
        li_spaces = len(line) - len(line.strip())
        if re.search(r'^\W\s\w',line.strip()):
            footnote_found = True
        if re.search(r'^VOL.*\d\d\d\d.*\d$',line.strip()) or 'THE COLLECTED WORKS OF MAHATMA GANDHI' in line.strip():
            # new page
            footnote_found = False
            i+=1
            # print('--',line.rstrip())
            continue
        if footnote_found == False:
            if not (li_spaces > mean_spaces):
                # the leading spaces of the current line are at or below the mean indentation
                line2 = sanitize(line)
                if len(line2) > 5 and len(line2.split()) > 4:
                    if show:
                        print('++',line.rstrip())
                    sents.append(line)
                    last_spaces = li_spaces
                elif last_spaces == li_spaces:
                    if show:
                        print('++',line.rstrip())
                    sents.append(line)
                else:
                    last_spaces = -1
                    if show:
                        print('--',line.rstrip())
            else:
                # the current line has more leading spaces than the mean
                next_line = content[i+1]
                lj_spaces = len(next_line) - len(next_line.strip())
                if not (lj_spaces > mean_spaces):
                    # print('b4', line)
                    line1 = sanitize(content[i])
                    line2 = sanitize(next_line)
                    # print('now',line2)
                    if len(line1) > 5 and len(line1.split()) > 4 and len(line2) > 5 and len(line2.split()) > 4:
                        sent_text = remove_footnotes_and_clean(sents)
                        paras.append(sent_text)
                        if show:
                            print('++',line.rstrip(),'<------NEW PARA')
                        sents = [line]
                        # print('$$',paras[-1])
                    else:
                        last_spaces = -1
                        if show:
                            print('--',line.rstrip())
                else:
                    last_spaces = -1
                    if show:
                        print('--',line.rstrip())
        else:
            last_spaces = -1
            if show:
                print('--',line.rstrip())
        if show:
            input('wait')
        i+=1

    file = open(path+m.split('/')[-1],'w')
    file.write('\n'.join(paras[1:]))
    file.close()
    # input('here wait')


# ----------------------------- Cleaning process 2/2 -----------------------------
path = 'clean_text_files/'
ml = sorted(glob.glob(path+'*.txt'))

text = []

for m in tqdm(range(1,99)):
    file = open(path+str(m)+'.txt','r')
    text += file.readlines()
    file.close()

file = open('all_paras.txt','w')
file.write(''.join(text))
file.close()

sents = []
tcsents = []  # transformer compatible sents
para_stack = []
for para in tqdm(text):
    para = para.strip()
    sents += sent_tokenize(para)
    para_stack = [para] + para_stack
    while len(para_stack)!=0:
        top_para = para_stack.pop(0)
        if len(tokenizer('<|startoftext|>'+ top_para + '<|endoftext|>')['input_ids']) > 200: # <-------------
            ts = sent_tokenize(top_para)
            if len(ts) > 1:
                para_stack = [' '.join(ts[int(len(ts)/2):])] + para_stack  # second half
                para_stack = [' '.join(ts[:int(len(ts)/2)])] + para_stack  # first half
            else:
                tcsents.append(top_para)
        else:
            tcsents.append(top_para)

file = open('all_sents.txt','w')
file.write('\n'.join(sents))
file.close()

file = open('all_tc_sents_200.txt','w')
file.write('\n'.join(tcsents))
file.close()
code/file.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7b2a57a58bc338df0a10eb28d73efe347d820bdd58a271b1f032562c8a857aa2
size 1112205
code/gpt-finetune.py
ADDED
@@ -0,0 +1,904 @@
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

import nltk
nltk.download('punkt')

import sys

import pytz
IST = pytz.timezone('Asia/Kolkata')
stamp = datetime.datetime.now(IST).strftime("%c")

print('\n')
print('='*100)
print('='*100)
print('\t\t=Experiment6=',stamp)
print('='*100)
print('='*100)

out_path = '/media/data_dump/Ritwik/ggpt/'

# for i in range(10):
#     print(i)
#     time.sleep(1)
# exit()

hyper_params = {'rseed': 123}

import torch, numpy as np, random, transformers, psutil, time
os.environ['PYTHONHASHSEED'] = str(hyper_params['rseed'])
# Torch RNG
torch.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed_all(hyper_params['rseed'])
# Python RNG
np.random.seed(hyper_params['rseed'])
random.seed(hyper_params['rseed'])
transformers.set_seed(hyper_params['rseed'])

# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')  # gpt2-medium

sfile = '/media/nas_mount/Ritwik/Ai4Bharat_text_corpora/data/en/en_clean.txt'
print(sfile)
file = open(sfile,'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])

print('Dataframe created')
df.dropna(inplace=True)  # remove NA values
bios = df.bio_main.copy()
print(datetime.datetime.now(IST).strftime("%c"))

# doc_lengths = []
# for bio in bios:
#     # get rough token count distribution
#     tokens = nltk.word_tokenize(bio)
#     doc_lengths.append(len(tokens))
# doc_lengths = np.array(doc_lengths)
# a = sns.distplot(doc_lengths)
# a.get_figure().savefig(out_path+"out.png")
# print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)',len(doc_lengths[doc_lengths > 768])/len(doc_lengths))
# print('np.average(doc_lengths)',np.average(doc_lengths))
# print(datetime.datetime.now(IST).strftime("%c"))

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))

batch_size = 8

class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # self.input_ids = []
        # self.attn_masks = []
        self.sents = list(txt_list)

        # for txt in txt_list:
        #     ###self.sents.append(txt)
        #     encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
        #     self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
        #     self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        # return len(self.input_ids)
        return len(self.sents)

    def __getitem__(self, idx):
        # return self.input_ids[idx], self.attn_masks[idx]
        txt = self.sents[idx]
        encodings_dict = self.tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attn_masks = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attn_masks

dataset = GPT2Dataset(bios, tokenizer, max_length=500)

# temp_dataloader = DataLoader(
#     dataset,                          # The training samples.
#     sampler = RandomSampler(dataset), # Select batches randomly
#     batch_size = batch_size           # Trains with this batch size.
# )
# for temp in temp_dataloader:
#     print(temp)
#     print(temp[0].shape)
#     input()

# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler = RandomSampler(train_dataset),  # Select batches randomly
    batch_size = batch_size                  # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                               # The validation samples.
    sampler = SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size = batch_size                    # Evaluate with this batch size.
)

# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")

model = model.to(device)

print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

# checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar') #.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')

# some parameters I cooked up that work reasonably well
epochs = 1
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every `sample_every` steps
sample_every = 1000

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                  )

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()

training_stats = []

last_epoch, last_step = -1, -1
try:
    file = open(out_path+'model_save/checkpoint_state_pretraining.txt','r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])

    checkpoint = torch.load(out_path+'model_save/model_checkpoint_pretraining.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
# else:
#     print(content)
#     input('wait')

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    if last_epoch!=-1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask = b_masks,
                        token_type_ids=None
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches. Ignoring the first step.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                bos_token_id=random.randint(1,30000),
                do_sample=True,
                top_k=50,
                max_length = 200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

            try:
                torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path+'model_save/checkpoint_state_pretraining.txt','w')
                file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
                file.close()
            except:
                torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path+'model_save/checkpoint_state_pretraining.txt','w')
                file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
                file.close()

        loss.backward()

        optimizer.step()

        scheduler.step()

    last_epoch, last_step = -1, -1
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            # token_type_ids=None,
                            attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))

try:
    # Display floats with two decimal places.
    pd.set_option('precision', 2)

    # Create a DataFrame from our training statistics.
    df_stats = pd.DataFrame(data=training_stats)

    # Use the 'epoch' as the row index.
    df_stats = df_stats.set_index('epoch')

    # A hack to force the column headers to wrap.
    # df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

    # Display the table.
    print(df_stats)

    # Use plot styling from seaborn.
    sns.set(style='darkgrid')

    # Increase the plot size and font size.
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)

    # Plot the learning curve.
    plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
    plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

    # Label the plot.
    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.xticks([1, 2, 3, 4])

    # plt.show()
    plt.savefig(out_path+"training.png")

    # Get all of the model's parameters as a list of tuples.
    params = list(model.named_parameters())

    print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:2]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[2:14]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-2:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    print("Saving model to %s" % output_dir)

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    # model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    # way 1
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # way 2
    # torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')

except Exception as e:
    print(e)
    print('Waiting for 10 seconds')
    time.sleep(10)

# ========================= Gandhi Data =======================

sfile = 'all_tc_sents_768.txt'
print(sfile)
file = open(sfile,'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])

print('Dataframe created')
df.dropna(inplace=True)  # remove NA values
bios = df.bio_main.copy()

doc_lengths = []
for bio in bios:
    # get rough token count distribution
    tokens = nltk.word_tokenize(bio)
    doc_lengths.append(len(tokens))
doc_lengths = np.array(doc_lengths)
a = sns.distplot(doc_lengths)
a.get_figure().savefig(out_path+"out.png")
print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)',len(doc_lengths[doc_lengths > 768])/len(doc_lengths))
print('np.average(doc_lengths)',np.average(doc_lengths))
print(datetime.datetime.now(IST).strftime("%c"))

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))

batch_size = 4

class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

dataset = GPT2Dataset(bios, tokenizer, max_length=768)

# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler = RandomSampler(train_dataset),  # Select batches randomly
    batch_size = batch_size                  # Trains with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                               # The validation samples.
    sampler = SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size = batch_size                    # Evaluate with this batch size.
)

# Turning this off
'''
# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")

model = model.to(device)
'''

print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

# checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar') #.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')

# some parameters I cooked up that work reasonably well
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every `sample_every` steps
sample_every = 1000

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                  )

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()

training_stats = []

last_epoch, last_step = -1, -1
try:
    file = open(out_path+'model_save/checkpoint_state.txt','r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])

    checkpoint = torch.load(out_path+'model_save/model_checkpoint.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
# else:
#     print(content)
#     input('wait')

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    if last_epoch!=-1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask = b_masks,
                        token_type_ids=None
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches. Ignoring the first step.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                bos_token_id=random.randint(1,30000),
                do_sample=True,
                top_k=50,
                max_length = 200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

            torch.save({'state_dict': model.state_dict()}, out_path+'model_save/model_checkpoint.pth.tar')
            torch.save(tokenizer, out_path+'model_save/tokenizer_checkpoint.pth.tar')
            file = open(out_path+'model_save/checkpoint_state.txt','w')
            file.write('step:'+str(step)+'|epoch:'+str(epoch_i))
            file.close()

        loss.backward()

        optimizer.step()

        scheduler.step()

    last_epoch, last_step = -1, -1
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs = model(b_input_ids,
                            # token_type_ids=None,
                            attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
# df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
print(df_stats)

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

# plt.show()
plt.savefig(out_path+"training.png")

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
# way 1
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# way 2
# torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')


# Loading

# way 1
# model = model.from_pretrained(output_dir).to(device)
# tokenizer = tokenizer.from_pretrained(output_dir)

# way 2
# checkpoint = torch.load(out_path+'model_save/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar')

print('Model and tokenizer loaded!')
print(datetime.datetime.now(IST).strftime("%c"))

model.eval()

prompt = "<|startoftext|> I wish to say that"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
    generated,
    # bos_token_id=random.randint(1,30000),
    do_sample=True,
    top_k=50,
    max_length = 500,
    top_p=0.95,
    num_return_sequences=3
)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

print(datetime.datetime.now(IST).strftime("%c"))
code/gpt-run.py
ADDED
@@ -0,0 +1,85 @@
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

import sys

import pytz
IST = pytz.timezone('Asia/Kolkata')
print(datetime.datetime.now(IST).strftime("%c"))

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')  # gpt2-medium

# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")

model = model.to(device)

print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

output_dir = '/media/data_dump/Ritwik/ggpt/model_save/pytorch_save_files/'

print('Loading fine-tuned weights')
model = model.from_pretrained(output_dir).to(device)
tokenizer = tokenizer.from_pretrained(output_dir)

print('Model and tokenizer loaded!')
print(datetime.datetime.now(IST).strftime("%c"))

model.eval()

# prompt_list = ['<|startoftext|> Regarding Kashmir I am very confident to say that','<|startoftext|> I wanted to save bhagat singh but','<|startoftext|> I wanted to save bhagat singh but fortunately','<|startoftext|> I wanted to save bhagat singh but unfortunately','<|startoftext|> Reporter: What is your biggest fear? Gandhi:','<|startoftext|> Question) What is your biggest fear?','<|startoftext|> Regarding Muslims and Islam I strongly believe that','<|startoftext|> I wish to say that','<|startoftext|> I chose Nehru over Patel for Prime Minister because','<|startoftext|> During my experiments with truth I observed that','<|startoftext|> My opinion on the negroes of Africa is that']
prompt_list = ['<|startoftext|> Regarding Kashmir I am very confident to say that']

for prompt in prompt_list:

    # prompt = "<|startoftext|> Regarding Kashmir I am very confident to say that"
    print(prompt)

    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    print(generated)

    sample_outputs = model.generate(
        generated,
        # bos_token_id=random.randint(1,30000),
        do_sample=True,
        top_k=50,
        max_length = 500,
        top_p=0.95,
        num_return_sequences=3
    )

    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

    print(datetime.datetime.now(IST).strftime("%c"))
    print('\n')
code/myocr.py
ADDED
@@ -0,0 +1,82 @@
'''
this file is responsible for scraping the gandhi text
'''

import pytesseract
from pytesseract import Output
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os.path

import fitz

import subprocess

def do_indent(df):
    text = ""
    # clean up blanks
    df1 = df[(df.conf!='-1')&(df.text!=' ')&(df.text!='')]
    # sort blocks vertically
    sorted_blocks = df1.groupby('block_num').first().sort_values('top').index.tolist()
    for block in sorted_blocks:
        curr = df1[df1['block_num']==block]
        sel = curr[curr.text.str.len()>3]
        char_w = (sel.width/sel.text.str.len()).mean()
        prev_par, prev_line, prev_left = 0, 0, 0
        # text = ''
        for ix, ln in curr.iterrows():
            # add new line when necessary
            if prev_par != ln['par_num']:
                text += '\n'
                prev_par = ln['par_num']
                prev_line = ln['line_num']
                prev_left = 0
            elif prev_line != ln['line_num']:
                text += '\n'
                prev_line = ln['line_num']
                prev_left = 0

            added = 0  # num of spaces that should be added
            if ln['left']/char_w > prev_left + 1:
                added = int((ln['left'])/char_w) - prev_left
                text += ' ' * added
            text += ln['text'] + ' '
            prev_left += len(ln['text']) + added + 1
        text += '\n'
    return text

text_file_path = 'text_files/'
start_page = 0

for h in range(1,99):
    tfile = text_file_path+str(h)+'.txt'
    url = "http://www.gandhiashramsevagram.org/gandhi-literature/mahatma-gandhi-collected-works-volume-"+str(h)+".pdf"
    bashCommand = "wget "+url+" -O file.pdf"
    process = subprocess.Popen(bashCommand.split())
    output, error = process.communicate()

    pdffile = "file.pdf"
    doc = fitz.open(pdffile)
    # https://stackoverflow.com/questions/46184239/extract-a-page-from-a-pdf-as-a-jpeg
    file_text = ""

    for i in tqdm(range(len(doc)), total=len(doc), desc=str(h)+'/98'):
        if i < start_page:
            continue
        page = doc.load_page(i)  # number of page
        mat = fitz.Matrix(5, 5)  # zoom factor
        pix = page.get_pixmap(matrix=mat)
        output = "outfile.png"
        pix.save(output)
        custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'
        d = pytesseract.image_to_data(Image.open(output), config=custom_config, output_type=Output.DICT)
        df = pd.DataFrame(d)
        file_text += do_indent(df)

    f = open(tfile,'w')
    f.write(file_text)
    f.close()
code/outfile.png
ADDED