In [None]:
%pip install transformers
%pip install torch
%pip install pandas
%pip install scikit-learn
%pip install datasets
%pip install evaluate
%pip install tqdm
%pip install openpyxl

In [None]:
# Mount the user's Google Drive into the Colab filesystem; later cells read the
# spreadsheets and the saved model from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollator
from datasets import Dataset, DatasetDict
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [None]:
# Checkpoint identifier handed to `from_pretrained`; the model cell reuses it.
local_directory = "xlm-roberta-base"
# Load the tokenizer from that same identifier instead of repeating the literal,
# so tokenizer and model cannot drift apart if the checkpoint is changed.
tokenizer = AutoTokenizer.from_pretrained(local_directory)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Labelled corpus (split into train/validation below) and a separate test sheet.
data = pd.read_excel('/content/drive/MyDrive/data/sentiment_analysis_data_km.xlsx')
test_df = pd.read_excel('/content/drive/MyDrive/data/test.xlsx')

# Khmer digits and sign characters (e.g. ។ ៕) that are stripped from the text.
_KHMER_NOISE = re.compile(r'[០១២៣៤៥៦៧៨៩។៕៖ៗ៘៙៚]+')


def clean_text(text):
    """Replace Khmer digits/signs with a space and collapse runs of whitespace.

    The operation is idempotent: cleaning an already-cleaned string returns it
    unchanged, so it is safe to apply to text that may have been cleaned before.
    """
    return ' '.join(_KHMER_NOISE.sub(' ', text).strip().split())


data['text_prep'] = data['text_prep'].apply(clean_text)
# Give the held-out texts the same normalisation as the training texts so the
# model is evaluated on input that looks like what it was trained on.
# NOTE(review): if test.xlsx was exported already cleaned this is a no-op.
test_df['X_test'] = test_df['X_test'].apply(clean_text)

In [None]:
# Remove repeated texts *before* splitting: an identical sentence landing in both
# the training and the validation part would inflate the validation scores.
# A non-mutating copy is used so re-running this cell gives the same result.
data_unique = data.drop_duplicates(subset=['text_prep'])

X = data_unique['text_prep']
y = data_unique['sentiment']
# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1234, shuffle=True, stratify=y)

In [None]:
# Drop rows whose cleaned text repeats an earlier row (first occurrence is kept).
# NOTE(review): on a top-to-bottom run this cell executes after the
# train/validation split, so it only changes `data`; X_train/X_val/y_train/y_val
# have already been materialised by then.
data.drop_duplicates(subset=['text_prep'], inplace=True)

In [None]:
# One frame per split, using the column names the later cells rely on
# ('text' for the input, 'labels' for the target).
# reset_index() without drop=True turns the previous index into an 'index' column.
train = pd.DataFrame({'text': X_train, 'labels': y_train}).reset_index()
val = pd.DataFrame({'text': X_val, 'labels': y_val}).reset_index()
test = pd.DataFrame({'text': test_df['X_test'], 'labels': test_df['y_test']}).reset_index()

In [None]:
# Convert each pandas frame into a Hugging Face Dataset (train, validation, test order).
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, val, test)
)

In [None]:
# Bundle the three splits under the keys used to look them up later.
splits = {'train': train_dataset, 'validation': val_dataset, 'test': test_dataset}
data_dict = DatasetDict(splits)
data_dict

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 8583
    })
    validation: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 2146
    })
    test: Dataset({
        features: ['index', 'text', 'labels'],
        num_rows: 400
    })
})

In [None]:
def tokenize_function(example):
    """Tokenize the ``text`` field of one example or one batch of examples.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded here. Padding inside a batched ``map`` pads every row to the
    longest row of the whole map batch; leaving it out lets
    ``DataCollatorWithPadding`` pad each mini-batch only to its own longest
    sequence when the DataLoader assembles it.

    Args:
        example: mapping with a ``"text"`` entry (a string or a list of strings).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    return tokenizer(example["text"], truncation=True)

In [None]:
# Collator that pads the examples of a mini-batch when a DataLoader assembles it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Run the tokenizer over every split, feeding it batches of examples.
tokenized_datasets = data_dict.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/8583 [00:00<?, ? examples/s]

Map:   0%|          | 0/2146 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 8583
    })
    validation: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2146
    })
    test: Dataset({
        features: ['index', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [None]:
# Drop the leftover 'index' column and the raw 'text', then have the datasets
# hand back torch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['index','text'])
    .with_format('torch')
)

In [None]:
# Display the splits to confirm which columns remain after 'index' and 'text' were removed.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 8583
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2146
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 400
    })
})

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on each pass; validation keeps dataset order.
# Both loaders use the collator to turn a list of examples into one batch.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch from the training loader and show the shape of each tensor.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

In [None]:
# Seed torch before the model is built. The classification head is newly
# initialised rather than loaded from the checkpoint, and the training loader
# shuffles, so without a seed every run starts from different weights and sees
# the batches in a different order. 1234 matches the split's random_state.
torch.manual_seed(1234)

# Pretrained encoder with a fresh two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    local_directory,
    num_labels=2,
    ignore_mismatched_sizes=True,
)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use torch's AdamW: the `transformers.AdamW` class is deprecated and is no
# longer importable from recent transformers releases, which this notebook
# installs unpinned. eps and weight_decay are set explicitly to the values the
# transformers implementation defaulted to, so the update rule is unchanged
# (torch's own defaults are eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)



In [None]:
from transformers import get_scheduler

# The schedule spans every optimizer step: batches per epoch times epochs.
num_epochs = 2
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

2146


In [None]:
import torch

# Prefer the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)  # Check the output
# Move the model's parameters to that device; as the last expression this also
# displays the module structure.
model.to(device)

cuda


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [None]:
from tqdm.notebook import tqdm  # notebook-friendly progress widget

# One tick per optimizer step, across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()  # put the model in training mode
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the collated batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then back-propagate the loss reported by the model.
        outputs = model(**batch)
        outputs.loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/2146 [00:00<?, ?it/s]

In [None]:
import numpy as np
from evaluate import load # Import load instead of load_metric

# One metric object per score reported below.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1")
)

model.eval()  # put the model in evaluation mode

# Per-example predictions and gold labels, kept for the confusion matrix.
all_predictions = []
all_labels = []

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():  # no gradients needed for scoring
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    references = batch["labels"]

    # Feed the same predictions/labels to every metric accumulator.
    for metric in (accuracy_metric, precision_metric, recall_metric, f1_metric):
        metric.add_batch(predictions=predictions, references=references)

    all_predictions.extend(predictions.cpu().numpy())
    all_labels.extend(references.cpu().numpy())

# Aggregate over the whole validation split; precision/recall/F1 are averaged
# over the classes weighted by class support.
accuracy = accuracy_metric.compute()
precision = precision_metric.compute(average="weighted")
recall = recall_metric.compute(average="weighted")
f1 = f1_metric.compute(average="weighted")

print(f"Evaluation Accuracy: {accuracy['accuracy']}")
print(f"Evaluation Precision: {precision['precision']}")
print(f"Evaluation Recall: {recall['recall']}")
print(f"Evaluation F1 Score: {f1['f1']}")

# Rows are true labels, columns are predicted labels.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(all_labels, all_predictions)
print("Confusion Matrix:")
print(cm)

Evaluating:   0%|          | 0/269 [00:00<?, ?it/s]

Evaluation Accuracy: 0.8541472506989748
Evaluation Precision: 0.8553107984468175
Evaluation Recall: 0.8541472506989748
Evaluation F1 Score: 0.85403705456117
Confusion Matrix:
[[945 126]
 [187 888]]


In [None]:
import numpy as np
from evaluate import load
from torch.utils.data import DataLoader

# Fresh metric objects so nothing carries over from the validation run.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1")
)

# The test split was tokenized and formatted together with the other splits,
# so it can be wrapped in a DataLoader directly.
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    batch_size=8,
    collate_fn=data_collator,
)

model.eval()  # put the model in evaluation mode

# Per-example predictions and gold labels, kept for the confusion matrix.
all_predictions = []
all_labels = []

for batch in tqdm(test_dataloader, desc="Evaluating on Test Set"):
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():  # no gradients needed for scoring
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    references = batch["labels"]

    # Feed the same predictions/labels to every metric accumulator.
    for metric in (accuracy_metric, precision_metric, recall_metric, f1_metric):
        metric.add_batch(predictions=predictions, references=references)

    all_predictions.extend(predictions.cpu().numpy())
    all_labels.extend(references.cpu().numpy())

# Aggregate over the whole test split; precision/recall/F1 are averaged over
# the classes weighted by class support.
accuracy = accuracy_metric.compute()
precision = precision_metric.compute(average="weighted")
recall = recall_metric.compute(average="weighted")
f1 = f1_metric.compute(average="weighted")

print(f"Test Set Accuracy: {accuracy['accuracy']}")
print(f"Test Set Precision: {precision['precision']}")
print(f"Test Set Recall: {recall['recall']}")
print(f"Test Set F1 Score: {f1['f1']}")

# Rows are true labels, columns are predicted labels.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(all_labels, all_predictions)
print("Confusion Matrix:")
print(cm)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Evaluating on Test Set:   0%|          | 0/50 [00:00<?, ?it/s]

Test Set Accuracy: 0.8325
Test Set Precision: 0.8355478766025641
Test Set Recall: 0.8325
Test Set F1 Score: 0.8325177969862311
Confusion Matrix:
[[167  42]
 [ 25 166]]


In [None]:
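# Uncomment to persist the fine-tuned weights to Drive; the next cell loads the model back from this same path.
# model.save_pretrained('/content/drive/MyDrive/models/my_trained_model')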

In [None]:
# Reload a fine-tuned checkpoint from Drive. The directory must already exist:
# the save_pretrained call in the preceding cell is commented out, so this does
# not necessarily hold the weights trained earlier in this run.
loaded_model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/models/my_trained_model')

In [None]:
text = "ខ្ញុំកំពុងរៀនពីកំហុសរបស់ខ្ញុំ និងមានអារម្មណ៍ថាវាជាពិធីបន្ថែមតម្លៃ។"

# Apply the same normalisation the training text received (Khmer digits/signs
# replaced by a space, whitespace collapsed) so the model sees input shaped like
# what it was fine-tuned on; the raw sentence above ends with a stripped sign.
text_clean = ' '.join(re.sub(r'[០១២៣៤៥៦៧៨៩។៕៖ៗ៘៙៚]+', ' ', text).strip().split())

inputs = tokenizer(text_clean, return_tensors="pt", truncation=True, padding=True)

# NOTE(review): assumes label id 0 means negative and 1 means positive in the
# training spreadsheet — confirm against the 'sentiment' column.
labels_mapping = {0: 'negative', 1: 'positive'}

# Inference only: evaluation mode and no autograd graph.
loaded_model.eval()
with torch.no_grad():
    outputs = loaded_model(**inputs)
logits = outputs.logits
# Predicted class = index of the largest logit for the single input sentence.
predicted_class = torch.argmax(logits, dim=1).item()

print(f"Predicted Class: {labels_mapping[predicted_class]}")

Predicted Class: positive
