# File size: 3,810 Bytes
# e1a5a68 5f4c80b e1a5a68
# (file-viewer line-number gutter 1-132 omitted)
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import os
# Restrict this process to the CUDA device numbered 1. This is set before the
# first CUDA query below so that the restriction is in place when torch looks
# for devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Sequence length used when encoding text (see MultiLabelDataset usage).
MAX_LEN = 512

# Batch sizes and optimisation settings. Only VALID_BATCH_SIZE is referenced
# by the visible inference code; the others are presumably left over from
# training -- TODO confirm before removing.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Shared tokenizer instance used by every dataset built in this module.
# NOTE(review): `truncation=True` is normally a per-call argument; it may have
# no effect when passed at construction time -- confirm against the
# transformers version in use.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)
class MultiLabelDataset(Dataset):
    """Dataset that tokenizes the ``comment_text`` column of a dataframe.

    Every item is a dict holding ``ids``, ``mask`` and ``token_type_ids`` as
    ``torch.long`` tensors. Unless ``new_data`` is true the dict also holds
    ``targets``, a ``torch.float`` tensor built from the ``labels`` column.

    Args:
        dataframe: Frame with a ``comment_text`` column and, when
            ``new_data`` is false, a ``labels`` column.
        tokenizer: Object providing ``encode_plus`` that returns a mapping
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length each encoded sequence is padded / truncated to.
        new_data: Set to True for unlabeled data; no ``targets`` are read
            or returned.
    """

    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.new_data = new_data
        if not new_data:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the text column."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the row at *position* ``index`` and return its tensors."""
        # Positional access: a DataLoader hands out 0..len-1, which only
        # matches the frame's labels when it has a default RangeIndex.
        raw_text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(raw_text.split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated `pad_to_max_length`.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        item = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(
                encoded["token_type_ids"], dtype=torch.long
            ),
        }
        if not self.new_data:
            item['targets'] = torch.tensor(
                self.targets.iloc[index], dtype=torch.float
            )
        return item
class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a small feed-forward classifier.

    The classifier maps the 768-wide vector at sequence position 0 to 6
    output logits.
    """

    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        head_layers = [
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6),
        ]
        self.classifier = torch.nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier logits for a batch.

        ``token_type_ids`` is accepted for call-site compatibility but is
        not passed to the encoder.
        """
        encoder_output = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoder_output[0][:, 0]
        return self.classifier(first_token)
# Build the network and place it on the selected device.
model = DistilBERTClass()
model.to(DEVICE)

# Restore the checkpoint weights; the state dict lives under the 'model' key.
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved
# on a GPU still loads when running on the CPU or on a different GPU index.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted
# source.
model_loaded = torch.load(
    'inference_models_output_4fold_distilbert_fold_best_model.pth',
    map_location=DEVICE,
)
model.load_state_dict(model_loaded['model'])

# Keyword arguments for the inference DataLoader: fixed batch size, and no
# shuffling so predictions stay in input order.
val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
}
def give_toxic(text):
    """Score a single piece of text against the six toxicity labels.

    Args:
        text: The comment to classify.

    Returns:
        dict mapping each of ``toxic``, ``severe_toxic``, ``obscene``,
        ``threat``, ``insult`` and ``identity_hate`` to the sigmoid of the
        corresponding model output (a NumPy scalar).
    """
    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    # Wrap the text in a one-row frame so it can go through the same
    # dataset / loader path as batched data.
    frame = pd.DataFrame([text], columns=['comment_text'])
    dataset = MultiLabelDataset(frame, tokenizer, MAX_LEN, new_data=True)
    loader = DataLoader(dataset, **val_params)

    batch_probas = []
    model.eval()  # disable dropout for inference
    with torch.inference_mode():
        for batch in loader:
            ids = batch['ids'].to(DEVICE, dtype=torch.long)
            mask = batch['mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(DEVICE, dtype=torch.long)
            logits = model(ids, mask, token_type_ids)
            batch_probas.append(torch.sigmoid(logits))

    # Only one row was submitted, so take row 0 of the stacked predictions.
    preds = torch.cat(batch_probas).detach().cpu().numpy()[0]
    return dict(zip(label_columns, preds))
def device():
    """Return the module-level ``DEVICE`` value used for inference."""
    return DEVICE
# Smoke test: classify a sample input and print the resulting scores.
print(give_toxic("fuck"))