import os
import sys
import time
import datetime
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import pytz
import nltk
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

nltk.download('punkt')

IST = pytz.timezone('Asia/Kolkata')
stamp = datetime.datetime.now(IST).strftime("%c")
print('\n')
print('=' * 100)
print('=' * 100)
print('\t\t=Experiment6=', stamp)
print('=' * 100)
print('=' * 100)

out_path = '/media/data_dump/Ritwik/ggpt/'

# for i in range(10):
#     print(i)
#     time.sleep(1)
# exit()

hyper_params = {'rseed': 123}

# Seed every RNG we rely on so runs are reproducible.
os.environ['PYTHONHASHSEED'] = str(hyper_params['rseed'])
# Torch RNG
torch.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed_all(hyper_params['rseed'])
# Python / NumPy RNG
np.random.seed(hyper_params['rseed'])
random.seed(hyper_params['rseed'])
transformers.set_seed(hyper_params['rseed'])

# Load the GPT-2 tokenizer with custom special tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')  # gpt2-medium

sfile = '/media/nas_mount/Ritwik/Ai4Bharat_text_corpora/data/en/en_clean.txt'
print(sfile)
file = open(sfile, 'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])
print('Dataframe created')
df.dropna(inplace=True)  # remove NA values
bios = df.bio_main.copy()
print(datetime.datetime.now(IST).strftime("%c"))

# doc_lengths = []
# for bio in bios:
#     # get rough token count distribution
#     tokens = nltk.word_tokenize(bio)
#     doc_lengths.append(len(tokens))
# doc_lengths = np.array(doc_lengths)
# a = sns.distplot(doc_lengths)
# a.get_figure().savefig(out_path+"out.png")
# print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)', len(doc_lengths[doc_lengths > 768])/len(doc_lengths))
# print('np.average(doc_lengths)', np.average(doc_lengths))
# print(datetime.datetime.now(IST).strftime("%c"))

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))

batch_size = 8
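# The Dataset below keeps the raw sentences and tokenizes them lazily in
# __getitem__: each text is wrapped in <|startoftext|> / <|endoftext|>,
# truncated or padded to max_length, and returned as (input_ids, attention_mask)
# tensors. An eager variant that pre-tokenized everything in __init__ is left
# commented out, presumably disabled to keep memory usage down on this corpus.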
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # self.input_ids = []
        # self.attn_masks = []
        self.sents = list(txt_list)
        # for txt in txt_list:
        #     encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
        #     self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
        #     self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        # return len(self.input_ids)
        return len(self.sents)

    def __getitem__(self, idx):
        # return self.input_ids[idx], self.attn_masks[idx]
        txt = self.sents[idx]
        encodings_dict = self.tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                        max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attn_masks = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attn_masks


dataset = GPT2Dataset(bios, tokenizer, max_length=500)

# temp_dataloader = DataLoader(
#     dataset,                          # The training samples.
#     sampler=RandomSampler(dataset),   # Select batches randomly.
#     batch_size=batch_size             # Trains with this batch size.
# )
# for temp in temp_dataloader:
#     print(temp)
#     print(temp[0].shape)
#     input()

# Split into training and validation sets.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Train with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)

# I'm not really doing anything with the config, but here it is.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the model.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# This step is necessary because some tokens (bos_token, etc.) were added to the tokenizer;
# otherwise the tokenizer and model embedding tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

# Tell PyTorch to run this model on the GPU.
device = torch.device("cuda")
model = model.to(device)
print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

# checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar')  # .from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')

# Some parameters that work reasonably well.
epochs = 1
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# Produce sample output every `sample_every` training steps.
sample_every = 1000

# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
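# Rough shape of the linear-warmup schedule created below (a sketch of how
# get_linear_schedule_with_warmup behaves, not something computed by this script):
#
#   lr(step) = learning_rate * step / warmup_steps                                  while step < warmup_steps
#   lr(step) = learning_rate * (total_steps - step) / (total_steps - warmup_steps)  afterwards, decaying to 0
#
# With warmup_steps = 100 and learning_rate = 5e-4, the LR ramps up over the first
# 100 batches and then falls linearly to zero by the last training step.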
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

# Create output directory if needed.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()
training_stats = []

# Resume from the last saved checkpoint, if a checkpoint state file exists.
last_epoch, last_step = -1, -1
try:
    file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])
    checkpoint = torch.load(out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
# else:
#     print(content)
#     input('wait')

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Skip epochs/steps that were already completed before a restart.
    if last_epoch != -1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get a sample every `sample_every` batches, ignoring the first step.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            model.train()

            # Checkpoint the model, tokenizer and loop position; if the first
            # attempt fails (e.g. a transient I/O error), retry once.
            try:
                torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'w')
                file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
                file.close()
            except:
                torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'w')
                file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
                file.close()

        loss.backward()
        optimizer.step()
        scheduler.step()

    last_epoch, last_step = -1, -1

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch.
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            # token_type_ids=None,
                            attention_mask=b_masks,
                            labels=b_labels)
            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))

try:
    # Display floats with two decimal places.
    pd.set_option('display.precision', 2)

    # Create a DataFrame from our training statistics.
    df_stats = pd.DataFrame(data=training_stats)

    # Use the 'epoch' as the row index.
    df_stats = df_stats.set_index('epoch')

    # A hack to force the column headers to wrap.
    # df = df.style.set_table_styles([dict(selector="th", props=[('max-width', '70px')])])

    # Display the table.
    print(df_stats)

    # Use plot styling from seaborn.
    sns.set(style='darkgrid')

    # Increase the plot size and font size.
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12, 6)

    # Plot the learning curve.
    plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
    plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

    # Label the plot.
    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.xticks([1, 2, 3, 4])

    # plt.show()
    plt.savefig(out_path + "training.png")

    # Get all of the model's parameters as a list of tuples.
    params = list(model.named_parameters())

    print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:2]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[2:14]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-2:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained().
    print("Saving model to %s" % output_dir)

    # Save the trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    # model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

    # way 1
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # way 2
    # torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')

except Exception as e:
    print(e)

print('Waiting for 10 seconds')
time.sleep(10)

# ========================= Gandhi Data =======================

sfile = 'all_tc_sents_768.txt'
print(sfile)
file = open(sfile, 'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])
print('Dataframe created')
df.dropna(inplace=True)  # remove NA values
bios = df.bio_main.copy()

doc_lengths = []
for bio in bios:
    # get rough token count distribution
    tokens = nltk.word_tokenize(bio)
    doc_lengths.append(len(tokens))
doc_lengths = np.array(doc_lengths)

a = sns.distplot(doc_lengths)
a.get_figure().savefig(out_path + "out.png")
print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)', len(doc_lengths[doc_lengths > 768]) / len(doc_lengths))
print('np.average(doc_lengths)', np.average(doc_lengths))
print(datetime.datetime.now(IST).strftime("%c"))

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))

batch_size = 4


# Eager variant for the (smaller) Gandhi corpus: everything is tokenized up front.
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


dataset = GPT2Dataset(bios, tokenizer, max_length=768)

# Split into training and validation sets.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Train with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)
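# Note: this second fine-tuning stage deliberately reuses the `model`, `tokenizer`
# and `device` from the first stage (embeddings already resized, weights already
# on the GPU), which is why the instantiation block below is disabled.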
# Turning this off
'''
# I'm not really doing anything with the config, but here it is.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the model.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# This step is necessary because some tokens (bos_token, etc.) were added to the tokenizer;
# otherwise the tokenizer and model embedding tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

# Tell PyTorch to run this model on the GPU.
device = torch.device("cuda")
model = model.to(device)
'''

# The model fine-tuned above is reused here; nothing new is loaded.
print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

# checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar')  # .from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')

# Some parameters that work reasonably well.
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# Produce sample output every `sample_every` training steps.
sample_every = 1000

# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

# Create output directory if needed.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()
training_stats = []

# Resume from the last saved checkpoint, if a checkpoint state file exists.
last_epoch, last_step = -1, -1
try:
    file = open(out_path + 'model_save/checkpoint_state.txt', 'r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])
    checkpoint = torch.load(out_path + 'model_save/model_checkpoint.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path + 'model_save/tokenizer_checkpoint.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
# else:
#     print(content)
#     input('wait')

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Skip epochs/steps that were already completed before a restart.
    if last_epoch != -1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get a sample every `sample_every` batches, ignoring the first step.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
            model.eval()
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            model.train()

            # Checkpoint the model, tokenizer and loop position.
            torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint.pth.tar')
            torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint.pth.tar')
            file = open(out_path + 'model_save/checkpoint_state.txt', 'w')
            file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
            file.close()

        loss.backward()
        optimizer.step()
        scheduler.step()

    last_epoch, last_step = -1, -1

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch.
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            # token_type_ids=None,
                            attention_mask=b_masks,
                            labels=b_labels)
            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))

# Display floats with two decimal places.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
# df = df.style.set_table_styles([dict(selector="th", props=[('max-width', '70px')])])

# Display the table.
print(df_stats)

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

# plt.show()
plt.savefig(out_path + "training.png")

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained().
print("Saving model to %s" % output_dir)

# Save the trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

# way 1
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# way 2
# torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')

# Loading
# way 1
# model = model.from_pretrained(output_dir).to(device)
# tokenizer = tokenizer.from_pretrained(output_dir)

# way 2
# checkpoint = torch.load(out_path+'model_save/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar')

print('Model and tokenizer loaded!')
print(datetime.datetime.now(IST).strftime("%c"))

model.eval()

prompt = "<|startoftext|> I wish to say that"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
print(generated)

sample_outputs = model.generate(
    generated,
    # bos_token_id=random.randint(1,30000),
    do_sample=True,
    top_k=50,
    max_length=500,
    top_p=0.95,
    num_return_sequences=3
)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

print(datetime.datetime.now(IST).strftime("%c"))
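# ---------------------------------------------------------------------------
# Minimal sketch for reusing the saved model from a separate script later on
# (assumes the `save_pretrained` output written to output_dir above; the prompt
# is illustrative and not part of this run):
#
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# import torch
#
# tok = GPT2Tokenizer.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save/')
# gpt = GPT2LMHeadModel.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save/').to('cuda').eval()
# ids = tok.encode("<|startoftext|> I wish to say that", return_tensors='pt').to('cuda')
# out = gpt.generate(ids, do_sample=True, top_k=50, top_p=0.95, max_length=200)
# print(tok.decode(out[0], skip_special_tokens=True))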