In [1]:
!pip install transformers datasets evaluate rouge_score accelerate



In [2]:
from huggingface_hub import notebook_login

# Prompt for Hugging Face Hub credentials; this notebook later uploads the
# trained model to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load PTS dataset

In [3]:
import pandas as pd
from datasets import Dataset

# Read the PTS spreadsheet and wrap it as a Hugging Face Dataset.
# read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_excel("PTS Dataset.xlsx")
dataset = Dataset.from_pandas(df)

In [4]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split (and
# therefore every downstream metric and example) reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [5]:
# Inspect one training record; each row has a "Text" and a "Summarize" field.
dataset["train"][0]

{'Text': 'In 2021, the APA issued a public apology for not doing its part to combat systemic racism and hurting communities of color. The statement included a list of resolutions to advocate for social equality across many areas, including education, criminal justice, and research.\nTo address racial inequality in the U.S. education system, the APA resolved to:\nReaffirm that race is a social construct with no biological basis.\nFoster more positive learning environments for people of color.\nEmphasize the importance of teaching the history of racism in schools.\nPromote teacher training to diminish racial biases.\nCall on educational institutions to adopt anti-racist policies.',
 'Summarize': 'The APA apologized in 2021 for not addressing systemic racism and addressing racial inequality in the U.S. education system. Resolutions included reaffirming race as a social construct, promoting positive learning environments, teaching racism history, promoting teacher training, and calling for

# Preprocess

In [6]:
from transformers import AutoTokenizer

# Pretrained checkpoint reused for both the tokenizer and the model weights.
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def preprocess_function(examples):
    """Tokenize a batch of records for sequence-to-sequence training.

    Args:
        examples: Mapping with a "Text" entry (source documents) and a
            "Summarize" entry (reference summaries).

    Returns:
        The tokenizer output for the source documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        reference summaries (truncated to 128 tokens).
    """
    model_inputs = tokenizer(examples["Text"], max_length=1024, truncation=True)

    # Summaries are encoded as targets; only their token ids are kept as labels.
    target_encoding = tokenizer(
        text_target=examples["Summarize"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encoding["input_ids"]

    return model_inputs

In [8]:
# Tokenize both splits with preprocess_function, processing records in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1439 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

# Pads inputs and labels dynamically per batch.
# The collator's `model` argument expects a model object, not a checkpoint name;
# a string there is silently ignored, so it is omitted rather than passed misleadingly.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Evaluate

In [10]:
import evaluate

# ROUGE scorer used by compute_metrics during evaluation.
rouge = evaluate.load("rouge")

In [11]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Positions
            equal to -100 in either array are treated as padding.

    Returns:
        Dict of the ROUGE scores plus ``gen_len`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Predictions may arrive as a tuple of outputs; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored/padded positions and cannot be decoded, so map it to
    # the pad token in BOTH arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each generated sequence, not counting padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Train

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence weights that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [13]:
# Fine-tuning hyperparameters; outputs are written under "PTS-Bart-Large-CNN".
# NOTE(review): in the recorded run the validation loss rises from ~0.87 (epoch 1)
# to ~1.26 (epoch 8) while Rouge1 only moves from 0.6166 to 0.6376 — consider
# fewer epochs or keeping the best checkpoint.
training_args = Seq2SeqTrainingArguments(
    output_dir="PTS-Bart-Large-CNN",
    eval_strategy="epoch",  # the recorded log shows one evaluation row per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,  # evaluation scores generated text (ROUGE / Gen Len columns in the log)
    fp16=True,  # NOTE(review): presumably requires a CUDA GPU runtime — confirm
    push_to_hub=True,
)

# Wire the model, tokenized splits, tokenizer, collator and metric function
# into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the recorded run completed 1440 steps over 8 epochs.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.87484,0.6166,0.3827,0.5058,0.5055,77.6583
2,No log,0.877407,0.6307,0.4064,0.5302,0.531,77.5111
3,0.676100,0.906377,0.635,0.4052,0.5309,0.5311,76.2833
4,0.676100,1.038586,0.6329,0.4038,0.5261,0.5262,78.4889
5,0.676100,1.099272,0.6285,0.4016,0.5239,0.5246,77.0083
6,0.201600,1.202453,0.6351,0.4126,0.5351,0.5356,76.0722
7,0.201600,1.239893,0.6356,0.4108,0.5362,0.5368,78.5361
8,0.201600,1.263763,0.6376,0.4143,0.538,0.5387,76.8417


Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=1440, training_loss=0.32941916783650715, metrics={'train_runtime': 2574.9157, 'train_samples_per_second': 4.471, 'train_steps_per_second': 0.559, 'total_flos': 7806107593924608.0, 'train_loss': 0.32941916783650715, 'epoch': 8.0})

In [14]:
# Upload the trained model to the Hugging Face Hub (requires the earlier login).
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

events.out.tfevents.1718450841.4441297fbb55.7613.0:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ahmedmbutt/PTS-Bart-Large-CNN/commit/b9ea22455049729a8466c7c45d60e82b72adfe10', commit_message='End of training', commit_description='', oid='b9ea22455049729a8466c7c45d60e82b72adfe10', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [15]:
# Take the first held-out record's source text as the inference example.
original_text = dataset['test'][0]['Text']
original_text

'One step toward changing your beliefs is undergoing a process called disputation. Disputation is meant to teach you life-long skills to help you manage your emotional response and overall mental health. During disputation, your therapist will challenge your irrational beliefs using direct methods. They may question your beliefs head-on, causing you to rethink them, or they could ask you to imagine another point of view that you haven’t considered before. '

In [16]:
from transformers import pipeline

# Build a summarization pipeline from "PTS-Bart-Large-CNN" (same name as the
# training output directory).
summarizer = pipeline("summarization", model="PTS-Bart-Large-CNN")

# Truncate over-long inputs to the model's maximum length, mirroring the
# truncation applied during training; without it, long texts overflow the model.
summary_outputs = summarizer(original_text, truncation=True)

# The pipeline returns a list with one dict per input; keep only the summary string.
summarized_text = summary_outputs[0]['summary_text']
summarized_text

Your max_length is set to 142, but your input_length is only 93. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


'Disputation is a process that challenges irrational beliefs by using direct methods and challenging beliefs, teaching life-long skills to manage emotional response and mental health. It involves questioning beliefs head-on or asking for alternative perspectives, challenging irrational beliefs and promoting new perspectives.'