# Fintune GPT2 using HuggingFace & PyTorch

In [1]:
!pip install --quiet transformers==4.2.2

Based off of [Philipp Schmid's](https://www.philschmid.de/philipp-schmid) [notebook](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb#scrollTo=laDp891gO25V) with data from the [Trump Twitter Archive](https://www.thetrumparchive.com/?results=1).

- GPT2 [Model Card](https://huggingface.co/gpt2)
-[HuggingFace's Finetuning Docs](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt)

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import json
from transformers import (
    TextDataset,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoModelWithLMHead,
    get_linear_schedule_with_warmup,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import torch
from pathlib import Path

In [3]:
model_name = "gpt2-medium"

if model_name  == "gpt2":
  model_size = "124M"
elif model_name  == "gpt2-medium":
  model_size = "355M"
elif model_name  == "gpt2-large":
  model_size = "774M"
elif model_name  == "gpt2-xl":
  model_size = "1.5B"

In [4]:
# define some params for model
batch_size = 8
epochs = 15
learning_rate = 5e-4
epsilon = 1e-8
warmup_steps = 1e2
sample_every = 100  # produce sample output every 100 steps
max_length = 140  # max length used in generate method of model

## Fetch / Load Data & Preprocess

In [5]:
tweets_path = Path("./data/tweets.json")
train_path = Path("./data/train_tweets.csv")
dev_path = Path("./data/dev_tweets.csv")

# fetch data if !exists already
if not tweets_path.exists():
  !mkdir data
  !wget -O ./data/tweets.json "https://drive.google.com/uc?export=download&id=16wm-2NTKohhcA26w-kaWfhLIGwl_oX95"

if not (train_path.exists() and dev_path.exists()):
    with open(tweets_path, 'rb') as f:
        # read json file into dict and then parse into df
        as_dict = json.loads(f.read())
        df = pd.DataFrame(as_dict)
    
    # filter df by !retweet
    df = df[df['isRetweet'] == "f"]

    # filter df to only text
    def is_multimedia(tweet: str):
        if tweet.startswith('https://t.co/'):
            return "t"
        else:
            return "f"

    df['isMultimedia'] = df['text'].apply(lambda x : is_multimedia(x))
    df = df[df['isMultimedia'] == "f"]
    df = df.reset_index(drop=True)

    # filter tweets to remove 'amp;'
    def remove_amp(tweet):
        tweet = tweet.replace('amp;', '')
        tweet = tweet.replace('amp', '')
        return tweet
    df['text'] = df['text'].apply(lambda x: remove_amp(x))

    # rename 'text' column to 'labels'
    # df = df.rename(columns={'text': 'labels'})
        
    # create train, validation splits
    train_data, dev_data = train_test_split(df[['text']], test_size=0.15) 
    
    train_data.to_csv(train_path, index=False, header=None)
    dev_data.to_csv(dev_path, index=False, header=None)

In [6]:
# create tokenized datasets
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    pad_token='<|endoftext|>'
)

# custom load_dataset function because there are no labels
def load_dataset(train_path, dev_path, tokenizer):
    block_size = 128
    # block_size = tokenizer.model_max_length
    
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=block_size)
     
    dev_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=dev_path,
          block_size=block_size)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, dev_dataset, data_collator

train_dataset, dev_dataset, data_collator = load_dataset(train_path, dev_path, tokenizer)



## Finetune Model

In [7]:
# AutoModelWithLMHead will pick GPT-2 weights from name
model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=Path('cache').resolve())

# necessary because of additional bos, eos, pad tokens to embeddings
model.resize_token_embeddings(len(tokenizer))

# create optimizer and learning rate schedule 
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

training_steps = len(train_dataset) * epochs

# adjust learning rate during training
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = training_steps)



In [8]:
training_args = TrainingArguments(
    output_dir=f"./{model_name}-{model_size}-trump",
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_steps = 400, # n update steps between two evaluations
    save_steps=800, # n steps per model save 
    warmup_steps=500, # n warmup steps for learning rate scheduler
    remove_unused_columns=False,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [9]:
# train & save model run
trainer.train()
trainer.save_model()

Step,Training Loss
500,3.6227
1000,3.3016
1500,3.1452
2000,2.932
2500,2.925
3000,2.7771
3500,2.6615
4000,2.6681
4500,2.4825
5000,2.4556


Step,Training Loss
500,3.6227
1000,3.3016
1500,3.1452
2000,2.932
2500,2.925
3000,2.7771
3500,2.6615
4000,2.6681
4500,2.4825
5000,2.4556


## Generate tweets

In [10]:
trump = pipeline("text-generation", model=f"./{model_name}-{model_size}-trump", tokenizer=tokenizer, config={"max_length":max_length})

In [None]:
#@title
# give Trump a prompt
result = trump('The democrats have')

In [12]:
trump('Why does the lying news media')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Why does the lying news media refuse to state that Cruz poll numbers, as opposed to others, are the highest of any GOP? He beat @RealBenCarson!"\n"""""Donald Trump to run for PGA Grand regressor"""" http'}]

In [13]:
trump("Today I'll be")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Today I'll be rallying w/ @FEMA, First Responders, Law Enforcement, and First Responders of Puerto Rico to help those most affected by the #IrmaFlood.https://t.co/gsFSghkmdM"}]

In [14]:
trump("The democrats have")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The democrats have made life so difficult for your favorite President and Vice President. Many thousands of jobs have been lost. Would rather make a deal with Russia than play games. Great power for the U.S.A."\n"... and the U'}]