File size: 4,271 Bytes
d2a1ee9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
---
license: afl-3.0
datasets:
- HuggingFaceTB/cosmopedia
metrics:
- accuracy
library_name: adapter-transformers
pipeline_tag: text-classification
tags:
- code
---
# Install the necessary libraries
!pip install transformers
!pip install torch
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Placeholder corpus for text classification -- substitute your own data.
texts = [...]  # input texts
labels = [...]  # one label per text (0 or 1 for binary classification)

# Hold out 20% of the examples for testing; the fixed seed makes the split
# reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Pretrained checkpoints that get fine-tuned further down.
ROBERTA_CHECKPOINT = "roberta-base"
XLNET_CHECKPOINT = "xlnet-base-cased"

# RoBERTa tokenizer and sequence-classification model.
roberta_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = RobertaForSequenceClassification.from_pretrained(ROBERTA_CHECKPOINT)

# XLNet tokenizer and sequence-classification model.
xlnet_tokenizer = XLNetTokenizer.from_pretrained(XLNET_CHECKPOINT)
xlnet_model = XLNetForSequenceClassification.from_pretrained(XLNET_CHECKPOINT)

# Every split is encoded with the same options: truncate over-long inputs and
# pad the rest so all sequences in a split share one length.
tokenize_options = {"truncation": True, "padding": True}

train_encodings_roberta = roberta_tokenizer(train_texts, **tokenize_options)
test_encodings_roberta = roberta_tokenizer(test_texts, **tokenize_options)

train_encodings_xlnet = xlnet_tokenizer(train_texts, **tokenize_options)
test_encodings_xlnet = xlnet_tokenizer(test_texts, **tokenize_options)
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence, so
            that ``encodings[key][i]`` is that feature for example ``i``.
        labels: Sequence holding one label per example.

    Raises:
        ValueError: If a feature does not contain exactly one entry per label.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs; otherwise the mismatch only shows up
        # later as an IndexError (or silently ignored examples) while iterating.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries "
                    f"but there are {expected} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)
# Wrap each tokenizer's encodings together with the matching labels.
train_dataset_roberta = MyDataset(train_encodings_roberta, train_labels)
test_dataset_roberta = MyDataset(test_encodings_roberta, test_labels)
train_dataset_xlnet = MyDataset(train_encodings_xlnet, train_labels)
test_dataset_xlnet = MyDataset(test_encodings_xlnet, test_labels)


def _make_training_args(run_name):
    """Return the shared hyper-parameters with a checkpoint dir for ``run_name``.

    Older transformers releases require ``output_dir``, and giving each model
    its own directory keeps one run's checkpoints from landing on top of the
    other's.
    """
    return TrainingArguments(
        output_dir=f'./results/{run_name}',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir='./logs',
        logging_steps=10,
    )


# Fine-tune RoBERTa model
training_args = _make_training_args('roberta')
trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
)
trainer_roberta.train()

# Fine-tune XLNet model (same hyper-parameters, separate output directory)
training_args_xlnet = _make_training_args('xlnet')
trainer_xlnet = Trainer(
    model=xlnet_model,
    args=training_args_xlnet,
    train_dataset=train_dataset_xlnet,
    eval_dataset=test_dataset_xlnet,
)
trainer_xlnet.train()
# Evaluate models
def evaluate_model(model, test_dataset, batch_size=8):
    """Score ``model`` on ``test_dataset`` for binary classification.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``
            and which exposes ``.device``.
        test_dataset: Map-style dataset whose items are dicts of tensors with
            a ``'labels'`` entry plus the model's input features.
        batch_size: Number of examples per forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        use ``average='binary'``.
    """
    # Iterating the dataset directly yields single un-batched examples (1-D
    # inputs, scalar labels); a DataLoader stacks them into real batches.
    loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    predictions = []
    labels = []

    # Switch to inference mode so dropout is disabled, and restore the
    # caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in loader:
                labels.extend(batch['labels'].tolist())
                # Forward every non-label feature (e.g. token_type_ids when the
                # tokenizer produced them), not only input_ids/attention_mask.
                inputs = {
                    key: value.to(model.device)
                    for key, value in batch.items()
                    if key != 'labels'
                }
                logits = model(**inputs).logits
                predictions.extend(torch.argmax(logits, dim=1).tolist())
    finally:
        model.train(was_training)

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    return accuracy, precision, recall, f1
# Score both fine-tuned models on their held-out test sets.
accuracy_roberta, precision_roberta, recall_roberta, f1_roberta = evaluate_model(
    roberta_model, test_dataset_roberta
)
accuracy_xlnet, precision_xlnet, recall_xlnet, f1_xlnet = evaluate_model(
    xlnet_model, test_dataset_xlnet
)

# Print one report per model.
print("RoBERTa Model Evaluation:")
print(f"Accuracy: {accuracy_roberta}")
print(f"Precision: {precision_roberta}")
print(f"Recall: {recall_roberta}")
print(f"F1 Score: {f1_roberta}")

print("\nXLNet Model Evaluation:")
print(f"Accuracy: {accuracy_xlnet}")
print(f"Precision: {precision_xlnet}")
print(f"Recall: {recall_xlnet}")
print(f"F1 Score: {f1_xlnet}")