---
license: afl-3.0
datasets:
- HuggingFaceTB/cosmopedia
metrics:
- accuracy
library_name: adapter-transformers
pipeline_tag: text-classification
tags:
- code
---
Install the necessary libraries (scikit-learn is also required, for the train/test split and the metrics):

```python
!pip install transformers
!pip install torch
!pip install scikit-learn
```

Import the required modules:

```python
import torch
import numpy as np
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    XLNetTokenizer,
    XLNetForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
```
Example dataset for text classification (replace with your own dataset):

```python
texts = [...]   # List of input texts
labels = [...]  # List of corresponding labels (0 or 1 for binary classification)
```
Split the dataset into training and testing sets:

```python
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
```
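For reference, a minimal sketch of what the placeholders might contain (hypothetical toy examples, purely illustrative):

```python
# Hypothetical toy data; replace with your own corpus.
texts = [
    "This library is easy to use.",
    "The build fails on every run.",
    "Great documentation and examples.",
    "The API keeps breaking between releases.",
]
labels = [1, 0, 1, 0]  # 1 = positive, 0 = negative
```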
Define the tokenizer and model for RoBERTa:

```python
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaForSequenceClassification.from_pretrained("roberta-base")
```
Define the tokenizer and model for XLNet:

```python
xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
xlnet_model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased")
```
Tokenize and encode the training and testing sets:

```python
train_encodings_roberta = roberta_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_roberta = roberta_tokenizer(test_texts, truncation=True, padding=True)

train_encodings_xlnet = xlnet_tokenizer(train_texts, truncation=True, padding=True)
test_encodings_xlnet = xlnet_tokenizer(test_texts, truncation=True, padding=True)
```
Wrap the encodings and labels in a PyTorch dataset:

```python
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset_roberta = MyDataset(train_encodings_roberta, train_labels)
test_dataset_roberta = MyDataset(test_encodings_roberta, test_labels)

train_dataset_xlnet = MyDataset(train_encodings_xlnet, train_labels)
test_dataset_xlnet = MyDataset(test_encodings_xlnet, test_labels)
```
Fine-tune the RoBERTa model (`TrainingArguments` requires an `output_dir` for checkpoints):

```python
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
)
trainer_roberta.train()
```
Fine-tune the XLNet model, reusing the same training arguments:

```python
trainer_xlnet = Trainer(
    model=xlnet_model,
    args=training_args,
    train_dataset=train_dataset_xlnet,
    eval_dataset=test_dataset_xlnet,
)
trainer_xlnet.train()
```
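To keep the fine-tuned weights for later use, `Trainer.save_model` writes them to disk (the output paths below are just examples):

```python
# Example output directories; choose any paths you like.
trainer_roberta.save_model('./roberta-finetuned')
trainer_xlnet.save_model('./xlnet-finetuned')
```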
Evaluate the models. The dataset yields single examples, so it is batched with a `DataLoader` before being passed to the model:

```python
from torch.utils.data import DataLoader

def evaluate_model(model, test_dataset, batch_size=8):
    model.eval()
    predictions = []
    labels = []
    # Batch the dataset; padding=True padded all examples to the same
    # length, so the default collate function can stack them directly.
    loader = DataLoader(test_dataset, batch_size=batch_size)
    for batch in loader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels.extend(batch['labels'].tolist())
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    return accuracy, precision, recall, f1
```
Run the evaluation and print the results:

```python
accuracy_roberta, precision_roberta, recall_roberta, f1_roberta = evaluate_model(
    roberta_model, test_dataset_roberta
)
accuracy_xlnet, precision_xlnet, recall_xlnet, f1_xlnet = evaluate_model(
    xlnet_model, test_dataset_xlnet
)

print("RoBERTa Model Evaluation:")
print(f"Accuracy: {accuracy_roberta}")
print(f"Precision: {precision_roberta}")
print(f"Recall: {recall_roberta}")
print(f"F1 Score: {f1_roberta}")

print("\nXLNet Model Evaluation:")
print(f"Accuracy: {accuracy_xlnet}")
print(f"Precision: {precision_xlnet}")
print(f"Recall: {recall_xlnet}")
print(f"F1 Score: {f1_xlnet}")
```