import os

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

# Pin the process to GPU 1; must be set before CUDA is first used.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
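
# MAX_LEN matches DistilBERT's 512-token limit. TRAIN_BATCH_SIZE, EPOCHS and
# LEARNING_RATE are training-time settings that go unused in this inference script.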
MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 1e-05
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# do_lower_case mirrors the uncased checkpoint; truncation and padding are
# requested per call in encode_plus rather than at construction time.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

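# Wraps a dataframe of comments and tokenizes each row on the fly; with
# new_data=True it carries no labels, so it can serve raw inference input.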
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.new_data = new_data
        self.max_len = max_len

        if not new_data:
            self.targets = self.data.labels

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Collapse runs of whitespace into single spaces.
        text = " ".join(str(self.text[index]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        out = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

        # At inference time (new_data=True) there are no labels to return.
        if not self.new_data:
            out['targets'] = torch.tensor(self.targets[index], dtype=torch.float)

        return out

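# DistilBERT encoder with a small feed-forward head on top; the [CLS]
# hidden state feeds a 768 -> 768 -> 6 classifier, one logit per label.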
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        # DistilBERT has no token type embeddings; token_type_ids is accepted
        # only so callers can pass the full tokenizer output unchanged.
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:, 0]  # [CLS] token representation
        out = self.classifier(out)
        return out

model = DistilBERTClass()
model.to(DEVICE)

# The checkpoint is a dict whose 'model' entry holds the state_dict;
# map_location lets a GPU-trained checkpoint load on a CPU-only machine.
model_loaded = torch.load('inference_models_output_4fold_distilbert_fold_best_model.pth',
                          map_location=DEVICE)
model.load_state_dict(model_loaded['model'])

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,  # keep predictions aligned with input order
}

def give_toxic(text):
    # Wrap the single string in a one-row dataframe so the Dataset can be reused.
    test_data = pd.DataFrame([text], columns=['comment_text'])
    test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
    test_loader = DataLoader(test_set, **val_params)

    all_test_pred = []

    def test():
        model.eval()
        with torch.inference_mode():
            for _, data in tqdm(enumerate(test_loader, 0)):
                ids = data['ids'].to(DEVICE, dtype=torch.long)
                mask = data['mask'].to(DEVICE, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
                outputs = model(ids, mask, token_type_ids)
                # Sigmoid turns each of the 6 logits into an independent probability.
                probas = torch.sigmoid(outputs)
                all_test_pred.append(probas)

    test()

    all_test_pred = torch.cat(all_test_pred)

    label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    preds = all_test_pred.detach().cpu().numpy()[0]

    final_dict = dict(zip(label_columns, preds))
    return final_dict

def device():
    return DEVICE


if __name__ == "__main__":
    # Quick smoke test when running the script directly.
    print(give_toxic("fuck"))
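
# A minimal usage sketch, assuming this file is saved as toxicity.py
# (the module name is hypothetical):
#
#     from toxicity import give_toxic
#
#     scores = give_toxic("have a nice day")
#     top_label = max(scores, key=scores.get)  # label with the highest probability
#     print(top_label, scores[top_label])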