import os
import sys
import time
import datetime
import random

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import psutil
import pytz
import nltk
import torch
import transformers
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import AdamW, get_linear_schedule_with_warmup

nltk.download('punkt')

IST = pytz.timezone('Asia/Kolkata')
stamp = datetime.datetime.now(IST).strftime("%c")
print('\n')
print('=' * 100)
print('=' * 100)
print('\t\t=Experiment6=', stamp)
print('=' * 100)
print('=' * 100)

out_path = '/media/data_dump/Ritwik/ggpt/'

# for i in range(10):
#     print(i)
#     time.sleep(1)
# exit()

hyper_params = {'rseed': 123}

# Seed every RNG we rely on so runs are reproducible.
os.environ['PYTHONHASHSEED'] = str(hyper_params['rseed'])
# Torch RNG
torch.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed(hyper_params['rseed'])
torch.cuda.manual_seed_all(hyper_params['rseed'])
# Python / NumPy RNG
np.random.seed(hyper_params['rseed'])
random.seed(hyper_params['rseed'])
transformers.set_seed(hyper_params['rseed'])

# Load the GPT-2 tokenizer with custom special tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')  # gpt2-medium

sfile = '/media/nas_mount/Ritwik/Ai4Bharat_text_corpora/data/en/en_clean.txt'
print(sfile)
file = open(sfile, 'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])
print('Dataframe created')
df.dropna(inplace=True)  # remove NA values
bios = df.bio_main.copy()
print(datetime.datetime.now(IST).strftime("%c"))

# doc_lengths = []
# for bio in bios:
#     # get rough token count distribution
#     tokens = nltk.word_tokenize(bio)
#     doc_lengths.append(len(tokens))
# doc_lengths = np.array(doc_lengths)
# a = sns.distplot(doc_lengths)
# a.get_figure().savefig(out_path+"out.png")
# print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)', len(doc_lengths[doc_lengths > 768])/len(doc_lengths))
# print('np.average(doc_lengths)', np.average(doc_lengths))
# print(datetime.datetime.now(IST).strftime("%c"))

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))

batch_size = 8
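# The Dataset below keeps the raw sentences and tokenizes them lazily in
# __getitem__: each text is wrapped in <|startoftext|> / <|endoftext|>,
# truncated or padded to max_length, and returned as (input_ids, attention_mask)
# tensors. An eager variant that pre-tokenized everything in __init__ is left
# commented out, presumably disabled to keep memory usage down on this corpus.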
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # self.input_ids = []
        # self.attn_masks = []
        self.sents = list(txt_list)
        # for txt in txt_list:
        #     encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")
        #     self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
        #     self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        # return len(self.input_ids)
        return len(self.sents)

    def __getitem__(self, idx):
        # return self.input_ids[idx], self.attn_masks[idx]
        txt = self.sents[idx]
        encodings_dict = self.tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                        max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attn_masks = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attn_masks


dataset = GPT2Dataset(bios, tokenizer, max_length=500)

# temp_dataloader = DataLoader(
#     dataset,                          # The training samples.
#     sampler=RandomSampler(dataset),   # Select batches randomly.
#     batch_size=batch_size             # Trains with this batch size.
# )
# for temp in temp_dataloader:
#     print(temp)
#     print(temp[0].shape)
#     input()

# Split into training and validation sets.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Train with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)

# I'm not really doing anything with the config, but here it is.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the model.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# This step is necessary because some tokens (bos_token, etc.) were added to the tokenizer;
# otherwise the tokenizer and model embedding tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

# Tell PyTorch to run this model on the GPU.
device = torch.device("cuda")
model = model.to(device)
print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

# checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar')  # .from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')

# Some parameters that work reasonably well.
epochs = 1
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# Produce sample output every `sample_every` training steps.
sample_every = 1000

# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
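# Rough shape of the linear-warmup schedule created below (a sketch of how
# get_linear_schedule_with_warmup behaves, not something computed by this script):
#
#   lr(step) = learning_rate * step / warmup_steps                                  while step < warmup_steps
#   lr(step) = learning_rate * (total_steps - step) / (total_steps - warmup_steps)  afterwards, decaying to 0
#
# With warmup_steps = 100 and learning_rate = 5e-4, the LR ramps up over the first
# 100 batches and then falls linearly to zero by the last training step.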
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

# Create output directory if needed.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()
training_stats = []

# Resume from the last saved checkpoint, if a checkpoint state file exists.
last_epoch, last_step = -1, -1
try:
    file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])
    checkpoint = torch.load(out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
# else:
#     print(content)
#     input('wait')

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Skip epochs/steps that were already completed before a restart.
    if last_epoch != -1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get a sample every `sample_every` batches, ignoring the first step.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            model.train()

            # Checkpoint the model, tokenizer and loop position; if the first
            # attempt fails (e.g. a transient I/O error), retry once.
            try:
                torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'w')
                file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
                file.close()
            except:
                torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint_pretraining.pth.tar')
                torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint_pretraining.pth.tar')
                file = open(out_path + 'model_save/checkpoint_state_pretraining.txt', 'w')
                file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
                file.close()

        loss.backward()
        optimizer.step()
        scheduler.step()

    last_epoch, last_step = -1, -1

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch.
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            # token_type_ids=None,
                            attention_mask=b_masks,
                            labels=b_labels)
            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))

try:
    # Display floats with two decimal places.
    pd.set_option('display.precision', 2)

    # Create a DataFrame from our training statistics.
    df_stats = pd.DataFrame(data=training_stats)

    # Use the 'epoch' as the row index.
    df_stats = df_stats.set_index('epoch')

    # A hack to force the column headers to wrap.
    # df = df.style.set_table_styles([dict(selector="th", props=[('max-width', '70px')])])

    # Display the table.
    print(df_stats)

    # Use plot styling from seaborn.
    sns.set(style='darkgrid')

    # Increase the plot size and font size.
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12, 6)

    # Plot the learning curve.
    plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
    plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

    # Label the plot.
    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.xticks([1, 2, 3, 4])

    # plt.show()
    plt.savefig(out_path + "training.png")

    # Get all of the model's parameters as a list of tuples.
    params = list(model.named_parameters())

    print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:2]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[2:14]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-2:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained().
    print("Saving model to %s" % output_dir)

    # Save the trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    # model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

    # way 1
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # way 2
    # torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')

except Exception as e:
    print(e)

print('Waiting for 10 seconds')
time.sleep(10)

# ========================= Gandhi Data =======================

sfile = 'all_tc_sents_768.txt'
print(sfile)
file = open(sfile, 'r')
lines = file.readlines()
file.close()
lines = [[x.strip()] for x in lines]

df = pd.DataFrame(lines, columns=['bio_main'])
print('Dataframe created')
df.dropna(inplace=True)  # remove NA values
bios = df.bio_main.copy()

doc_lengths = []
for bio in bios:
    # get rough token count distribution
    tokens = nltk.word_tokenize(bio)
    doc_lengths.append(len(tokens))
doc_lengths = np.array(doc_lengths)

a = sns.distplot(doc_lengths)
a.get_figure().savefig(out_path + "out.png")
print('len(doc_lengths[doc_lengths > 768])/len(doc_lengths)', len(doc_lengths[doc_lengths > 768]) / len(doc_lengths))
print('np.average(doc_lengths)', np.average(doc_lengths))
print(datetime.datetime.now(IST).strftime("%c"))

print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))
print(datetime.datetime.now(IST).strftime("%c"))

batch_size = 4


# Eager variant for the (smaller) Gandhi corpus: everything is tokenized up front.
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]


dataset = GPT2Dataset(bios, tokenizer, max_length=768)

# Split into training and validation sets.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print(datetime.datetime.now(IST).strftime("%c"))

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
    train_dataset,                           # The training samples.
    sampler=RandomSampler(train_dataset),    # Select batches randomly.
    batch_size=batch_size                    # Train with this batch size.
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,                             # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size                    # Evaluate with this batch size.
)
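# Note: this second fine-tuning stage deliberately reuses the `model`, `tokenizer`
# and `device` from the first stage (embeddings already resized, weights already
# on the GPU), which is why the instantiation block below is disabled.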
# Turning this off
'''
# I'm not really doing anything with the config, but here it is.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the model.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# This step is necessary because some tokens (bos_token, etc.) were added to the tokenizer;
# otherwise the tokenizer and model embedding tensors won't match up.
model.resize_token_embeddings(len(tokenizer))

# Tell PyTorch to run this model on the GPU.
device = torch.device("cuda")
model = model.to(device)
'''

# The model fine-tuned above is reused here; nothing new is loaded.
print('Model loaded to GPU')
print(datetime.datetime.now(IST).strftime("%c"))

# checkpoint = torch.load(out_path+'model_save_768/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save_768/tokenizer_checkpoint.pth.tar')  # .from_pretrained('/media/data_dump/Ritwik/ggpt/model_save_768/')

# Some parameters that work reasonably well.
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# Produce sample output every `sample_every` training steps.
sample_every = 1000

# Note: AdamW is a class from the huggingface library (as opposed to pytorch).
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))


output_dir = '/media/data_dump/Ritwik/ggpt/model_save/'

# Create output directory if needed.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

total_t0 = time.time()
training_stats = []

# Resume from the last saved checkpoint, if a checkpoint state file exists.
last_epoch, last_step = -1, -1
try:
    file = open(out_path + 'model_save/checkpoint_state.txt', 'r')
    content = [x.split(':') for x in file.read().split('|')]
    file.close()
except:
    content = []

if len(content) == 2:
    last_epoch = int(content[1][1])
    last_step = int(content[0][1])
    checkpoint = torch.load(out_path + 'model_save/model_checkpoint.pth.tar')
    print(model.load_state_dict(checkpoint['state_dict']))
    tokenizer = torch.load(out_path + 'model_save/tokenizer_checkpoint.pth.tar')
    print(datetime.datetime.now(IST).strftime("%c"))
# else:
#     print(content)
#     input('wait')

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Skip epochs/steps that were already completed before a restart.
    if last_epoch != -1:
        if epoch_i < last_epoch:
            continue

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if last_step != -1:
            if step <= last_step:
                continue

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get a sample every `sample_every` batches, ignoring the first step.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
            model.eval()
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=200,
                top_p=0.95,
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            model.train()

            # Checkpoint the model, tokenizer and loop position.
            torch.save({'state_dict': model.state_dict()}, out_path + 'model_save/model_checkpoint.pth.tar')
            torch.save(tokenizer, out_path + 'model_save/tokenizer_checkpoint.pth.tar')
            file = open(out_path + 'model_save/checkpoint_state.txt', 'w')
            file.write('step:' + str(step) + '|epoch:' + str(epoch_i))
            file.close()

        loss.backward()
        optimizer.step()
        scheduler.step()

    last_epoch, last_step = -1, -1

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    print(datetime.datetime.now(IST).strftime("%c"))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch.
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            # token_type_ids=None,
                            attention_mask=b_masks,
                            labels=b_labels)
            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
print(datetime.datetime.now(IST).strftime("%c"))

# Display floats with two decimal places.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
# df = df.style.set_table_styles([dict(selector="th", props=[('max-width', '70px')])])

# Display the table.
print(df_stats)

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

# plt.show()
plt.savefig(out_path + "training.png")

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# Saving best-practices: if you use default names for the model, you can reload it using from_pretrained().
print("Saving model to %s" % output_dir)

# Save the trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`.
# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

# way 1
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# way 2
# torch.save({'state_dict': model.state_dict()}, out_path+'model_save/final_checkpoint.pth.tar')

# Loading
# way 1
# model = model.from_pretrained(output_dir).to(device)
# tokenizer = tokenizer.from_pretrained(output_dir)

# way 2
# checkpoint = torch.load(out_path+'model_save/final_checkpoint.pth.tar')
# print(model.load_state_dict(checkpoint['state_dict']))
# del checkpoint
# tokenizer = torch.load(out_path+'model_save/tokenizer_checkpoint.pth.tar')

print('Model and tokenizer loaded!')
print(datetime.datetime.now(IST).strftime("%c"))

model.eval()

prompt = "<|startoftext|> I wish to say that"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
print(generated)

sample_outputs = model.generate(
    generated,
    # bos_token_id=random.randint(1,30000),
    do_sample=True,
    top_k=50,
    max_length=500,
    top_p=0.95,
    num_return_sequences=3
)

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

print(datetime.datetime.now(IST).strftime("%c"))
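# ---------------------------------------------------------------------------
# Minimal sketch for reusing the saved model from a separate script later on
# (assumes the `save_pretrained` output written to output_dir above; the prompt
# is illustrative and not part of this run):
#
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
# import torch
#
# tok = GPT2Tokenizer.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save/')
# gpt = GPT2LMHeadModel.from_pretrained('/media/data_dump/Ritwik/ggpt/model_save/').to('cuda').eval()
# ids = tok.encode("<|startoftext|> I wish to say that", return_tensors='pt').to('cuda')
# out = gpt.generate(ids, do_sample=True, top_k=50, top_p=0.95, max_length=200)
# print(tok.decode(out[0], skip_special_tokens=True))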