### Data Preparation

In [None]:
# Import pandas for data analysis
import pandas as pd

df  = pd.read_csv("/content/Merged_data_QA.csv")
df.head() #show first five rows

Unnamed: 0,Questions,Answers
0,What does it mean to have a mental illness?,Mental illnesses are health conditions that di...
1,Who does mental illness affect?,It is estimated that mental illness affects 1 ...
2,What causes mental illness?,It is estimated that mental illness affects 1 ...
3,What are some of the warning signs of mental i...,Symptoms of mental health disorders vary depen...
4,Can people with mental illness recover?,"When healing from mental illness, early identi..."


In [None]:
df.shape # 118 rows | 2 cols

(118, 2)

In [None]:
len(df.Questions)

118

In [None]:
len(df.Answers)

118

In [None]:
!pip install cleantext

In [None]:
# Function to clean text data by removing unwanted characters and formatting
import cleantext

def clean(textdata):
  '''Use the cleantext library to remove extra spaces, lowercase the text,
        and remove numbers and punctuation'''
    cleaned_text = []
    for i in textdata:
        cleaned_text.append(cleantext.clean(str(i), extra_spaces=True, lowercase=True, stopwords=False, stemming=False, numbers=True, punct=True, clean_all = True))

    return cleaned_text

In [None]:
# Apply the clean function to the questions and answers columns

df.Questions = list(clean(df.Questions))
df.Answers = list(clean(df.Answers))

In [None]:
# Save the cleaned data into a new CSV file & save
df.to_csv("cleaned_QA_data.csv", index=False)

### GPT-2 Model

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K  

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch

# Load the GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Set the padding token for the tokenizer to be the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Maximum sequence length that GPT-2 can handle
max_length = tokenizer.model_max_length
print(max_length)

1024


In [None]:
from datasets import load_dataset

# Load the cleaned QA dataset as a training set using the 'datasets' library
dataset = load_dataset('csv', data_files={'train': 'cleaned_QA_data.csv'}, split='train')

In [None]:
#Function to tokenize questions and answers and prepare them for the model
def tokenize_function(examples):

  '''1. Combine each question and answer into a single input string
     2. Tokenize the combined text using the GPT-2 tokenizer
     3. Set the labels to be the same as the input_ids (shifted to predict the next word)
     4. Return the tokenized output. '''

    combined_text = [str(q) + " " + str(a) for q, a in zip(examples['Questions'], examples['Answers'])]
    tokenized_output = tokenizer(combined_text, padding='max_length', truncation=True, max_length=128)

    tokenized_output['labels'] = tokenized_output['input_ids'].copy()

    return tokenized_output

Map:   0%|          | 0/118 [00:00<?, ? examples/s]

In [None]:
# Tokenize the entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Define training arguments for the GPT-2 model
training_args = TrainingArguments(
    output_dir='./results', # Directory to save model outputs
    num_train_epochs=50, # Train for 50 epochs
    per_device_train_batch_size=16, # Batch size during training
    per_device_eval_batch_size=32, # Batch size during evaluation
    warmup_steps=500, # Warmup steps for learning rate scheduler
    weight_decay=0.01, # Weight decay for regularization
    logging_dir='./logs', # Directory for saving logs
    logging_steps=10, # Log every 10 steps
    save_steps=1000, # Save the model every 1000 steps
)

Step,Training Loss
10,5.2195
20,4.857
30,4.4655
40,3.9377
50,3.427
60,3.1875
70,2.8692
80,2.853
90,2.6567
100,2.5813


TrainOutput(global_step=400, training_loss=1.9328853976726532, metrics={'train_runtime': 206.7338, 'train_samples_per_second': 28.539, 'train_steps_per_second': 1.935, 'total_flos': 385405747200000.0, 'train_loss': 1.9328853976726532, 'epoch': 50.0})

In [None]:
# Trainer class to handle training process
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Train the model
trainer.train()

In [None]:
# Save the model
trainer.save_model('counsel_model')

In [7]:
# Function to generate a response based on a user prompt (testing the model)
def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated output
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the prompt from the response
    if response.startswith(prompt):
        response = response[len(prompt):].strip()  # Remove the prompt from the response

    return response


In [9]:
# Example conversation
user_input = "How can I become a Data Scientist?"
bot_response = generate_response(user_input)
print("Bot Response:", bot_response)

Bot Response: Data scientists are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained to be data scientists. They are trained


In [None]:
# Copying the model to Google Drive (optional)
import shutil

# Path to the file in Colab
colab_file_path = '/content/counsel_model/model.safetensors'

# Path to your Google Drive
drive_file_path = '/content/drive/MyDrive'

# Copy the file
shutil.copy(colab_file_path, drive_file_path)

'/content/drive/MyDrive/model.safetensors'