# Fintune GPT2 using HuggingFace & PyTorch

In [1]:
!pip install --quiet transformers==4.2.2

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m880.6/880.6 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.9/2.9 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


Based off of [Philipp Schmid's](https://www.philschmid.de/philipp-schmid) [notebook](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb#scrollTo=laDp891gO25V) with data from the [Trump Twitter Archive](https://www.thetrumparchive.com/?results=1).

- GPT2 [Model Card](https://huggingface.co/gpt2)
-[HuggingFace's Finetuning Docs](https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt)

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import json
from transformers import (
    TextDataset,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoModelWithLMHead,
    get_linear_schedule_with_warmup,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import torch
from pathlib import Path

In [3]:
model_name = "gpt2"

if model_name  == "gpt2":
  model_size = "124M"
elif model_name  == "gpt2-medium":
  model_size = "355M"
elif model_name  == "gpt2-large":
  model_size = "774M"
elif model_name  == "gpt2-xl":
  model_size = "1.5B"

In [4]:
# define some params for model
max_length = 100
batch_size = 32
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# produce sample output every 100 steps
sample_every = 100

## Fetch / Load Data & Preprocess

In [5]:
tweets_path = Path("./data/tweets.json")
train_path = Path("./data/train_tweets.csv")
dev_path = Path("./data/dev_tweets.csv")

# fetch data if !exists already
if not tweets_path.exists():
  !mkdir data
  !wget -O ./data/tweets.json "https://drive.google.com/uc?export=download&id=16wm-2NTKohhcA26w-kaWfhLIGwl_oX95"

if not (train_path.exists() and dev_path.exists()):
    with open(tweets_path, 'rb') as f:
        # read json file into dict and then parse into df
        as_dict = json.loads(f.read())
        df = pd.DataFrame(as_dict)
    
    # filter df by !retweet
    df = df[df['isRetweet'] == "f"]

    # filter df to only text
    def is_multimedia(tweet: str):
        if tweet.startswith('https://t.co/'):
            return "t"
        else:
            return "f"

    df['isMultimedia'] = df['text'].apply(lambda x : is_multimedia(x))
    df = df[df['isMultimedia'] == "f"]
    df = df.reset_index(drop=True)

    # filter tweets to remove 'amp;'
    def remove_amp(tweet):
        tweet = tweet.replace('amp;', '')
        tweet = tweet.replace('amp', '')
        return tweet
    df['text'] = df['text'].apply(lambda x: remove_amp(x))

    # rename 'text' column to 'labels'
    # df = df.rename(columns={'text': 'labels'})
        
    # create train, validation splits
    train_data, dev_data = train_test_split(df[['text']], test_size=0.15) 
    
    train_data.to_csv(train_path, index=False, header=None)
    dev_data.to_csv(dev_path, index=False, header=None)

--2023-04-23 20:45:15--  https://drive.google.com/uc?export=download&id=16wm-2NTKohhcA26w-kaWfhLIGwl_oX95
Resolving drive.google.com (drive.google.com)... 108.177.126.113, 108.177.126.138, 108.177.126.102, ...
Connecting to drive.google.com (drive.google.com)|108.177.126.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-0c-04-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/5h7rho54blq770420uluq9enivf39nuc/1682282700000/12919236576798385522/*/16wm-2NTKohhcA26w-kaWfhLIGwl_oX95?e=download&uuid=82f91d2d-5c4e-47b4-9acb-d781f8f5f78c [following]
--2023-04-23 20:45:21--  https://doc-0c-04-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/5h7rho54blq770420uluq9enivf39nuc/1682282700000/12919236576798385522/*/16wm-2NTKohhcA26w-kaWfhLIGwl_oX95?e=download&uuid=82f91d2d-5c4e-47b4-9acb-d781f8f5f78c
Resolving doc-0c-04-docs.googleusercontent.com (doc-0c-04-docs.googleusercontent.com)... 108.177.126.132, 

In [6]:
# create tokenized datasets
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    pad_token='<|endoftext|>'
)

# custom load_dataset function because there are no labels
def load_dataset(train_path, dev_path, tokenizer):
    block_size = 128
    # block_size = tokenizer.model_max_length
    
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=block_size)
     
    dev_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=dev_path,
          block_size=block_size)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, dev_dataset, data_collator

train_dataset, dev_dataset, data_collator = load_dataset(train_path, dev_path, tokenizer)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1394877 > 1024). Running this sequence through the model will result in indexing errors


## Finetune Model

In [7]:
# model = GPT2LMHeadModel.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained(model_name, cache_dir=Path('cache').resolve())

# necessary because of additional bos, eos, pad tokens to embeddings
model.resize_token_embeddings(len(tokenizer))

# create optimizer and learning rate schedule 
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

training_steps = len(train_dataset) * epochs

# adjust learning rate during training
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = training_steps)



Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [8]:
training_args = TrainingArguments(
    output_dir=f"./{model_name}-{model_size}-trump",
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    eval_steps = 400, # n update steps between two evaluations
    save_steps=800, # n steps per model save 
    warmup_steps=500, # n warmup steps for learning rate scheduler
    remove_unused_columns=False,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [9]:
# train & save model run
trainer.train()
trainer.save_model()

Step,Training Loss
500,3.8523


Step,Training Loss
500,3.8523
1000,3.4087
1500,3.2508


## Generate tweets

In [10]:
trump = pipeline("text-generation", model=f"./{model_name}-{model_size}-trump", tokenizer=tokenizer, config={"max_length":140})

In [11]:
# give Trump a prompt
trump('The democrats have')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The democrats have no respect for the people of this Country.‚Äù  @CynthiaLM\nThe GOP should move quickly to fix ObamaCare and defund ObamaCare fast!\n"""""@nathalie_k: @realDonaldTrump you\'ll'}]

In [12]:
trump('Why does the lying news media')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Why does the lying news media destroy credibility in the House?"\n@TrentBaxter8  Thanks--a great book.\n"With all due respect to @BarackObama, the great United States Military is not yet fully operational and is'}]

In [13]:
trump("Today I'll be")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Today I\'ll be on @foxandfriends today live from Palm Beach... #CelebrityApprentice\n"....I am a winner, but also a winner with a great future. In this case, a great future for Americans at large!'}]